commit 510e829c1b43a608c3d9f6eba8268f282dc5ba0a
parent 92d1c32d8a252226d937743b2c5169c867818486
Author: William Casarin <jb55@jb55.com>
Date: Tue, 4 May 2021 03:38:39 -0700
bin: pandocweb
download a website before running it through pandoc
Diffstat:
A | bin/pandocweb | | | 51 | +++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 51 insertions(+), 0 deletions(-)
diff --git a/bin/pandocweb b/bin/pandocweb
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+PANDOC=${PANDOC:-pandoc}
+
+# ERR trap handler: reports that the script failed, together with the current
+# working directory so whatever was left there can be inspected.
+# Outputs: diagnostic on stderr (stdout stays reserved for the result path).
+function onfail() {
+  # $PWD is passed as data via %s so a '%' or backslash in the path is
+  # printed verbatim instead of being interpreted as a format directive.
+  printf '\n\nFAILED, output dir: %s\n\n' "$PWD" >&2
+}
+
+# Prints a default output path inside the current directory of the form
+# out-XXXXXXXX.epub, where XXXXXXXX is the first 8 hex digits of the SHA-256
+# digest derived from the URL.
+# Globals: url (read), PWD (read)
+# Outputs: the path, newline-terminated, on stdout
+function get_tmpname() {
+  local hashpart
+  # The here-string appends a newline, so the digest covers "$url\n".
+  hashpart=$(sha256sum <<<"$url" | awk '{print $1}')
+  # Pass the directory and hash as %s arguments so a '%' or backslash in
+  # $PWD cannot be misread as part of the format string.
+  printf '%s/out-%s.epub\n' "$PWD" "${hashpart:0:8}"
+}
+
+trap onfail ERR
+
+url="$1"
+# extract the protocol
+proto="$(<<<"$url" grep :// | sed -e's,^\(.*://\).*,\1,g')"
+noproto=$(<<<"$url" sed -e s,$proto,,g)
+path="$(<<<"$noproto" grep / | cut -d/ -f2-)"
+cdpath=$(basename "$path")
+
+out_name=$(readlink -f "${2:-"$(get_tmpname)"}")
+hostname=$(echo "$url" | awk -F[/:] '{print $4}')
+
+tmpdir=$(mktemp -d)
+
+cd "$tmpdir"
+
+wget \
+ --page-requisites \
+ --adjust-extension \
+ --span-hosts \
+ --convert-links \
+ --restrict-file-names=windows \
+ --domains "$hostname" \
+ --no-parent \
+ "$url" 1>&2
+
+target=$(find . -name '*.htm*' | grep . | head -n1)
+file=$(basename "$target")
+cd "$(dirname "$target")"
+printf "converting %s ... " "$file" 1>&2
+$PANDOC "$file" -o "$out_name" 1>&2
+printf "done. saved as:\n" "$out_name" 1>&2
+printf "%s\n" "$out_name"
+
+cd
+rm -rf "$tmpdir"