citadel

My dotfiles, scripts and nix configs
git clone git://jb55.com/citadel
Log | Files | Refs | README | LICENSE

commit 510e829c1b43a608c3d9f6eba8268f282dc5ba0a
parent 92d1c32d8a252226d937743b2c5169c867818486
Author: William Casarin <jb55@jb55.com>
Date:   Tue,  4 May 2021 03:38:39 -0700

bin: pandocweb

download a website before running it through pandoc

Diffstat:
A bin/pandocweb | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+), 0 deletions(-)

#!/usr/bin/env bash
#
# bin/pandocweb: download a web page together with its page requisites,
# then run the saved HTML through pandoc.
#
# Usage: pandocweb URL [OUTPUT]
#
#   URL     page to fetch with wget
#   OUTPUT  file to write; when omitted or empty, defaults to
#           out-<first 8 hex chars of sha256(URL)>.epub in the directory
#           the script was started from
#
# Environment:
#   PANDOC  converter command (default: pandoc)
#
# Progress and error messages go to stderr. The only thing written to stdout
# is the absolute path of the generated file, so the script can be used
# inside $(...).

set -euo pipefail

PANDOC=${PANDOC:-pandoc}

# ERR trap handler. Reports the current directory so a failed download or
# conversion can be inspected; the temporary directory is intentionally NOT
# removed on failure for that reason.
function onfail() {
  printf '\n\nFAILED, output dir: %s\n\n' "$PWD" >&2
}

# Print the default output path: $PWD/out-<8 hex chars>.epub, where the hex
# part is taken from the sha256 of the global $url. Must be called before the
# script changes into its temporary directory so $PWD is still the start dir.
function get_tmpname() {
  local hashpart
  hashpart=$(sha256sum <<<"$url" | awk '{print $1}')
  printf '%s/out-%s.epub\n' "$PWD" "${hashpart:0:8}"
}

# Print the host part of the URL given as $1: an optional "scheme://" prefix,
# any "user[:pass]@" userinfo, the ":port" suffix and everything from the
# first "/", "?" or "#" are stripped. Bracketed IPv6 literals are not handled.
function url_host() {
  local rest=$1
  local -r scheme_re='^[A-Za-z][A-Za-z0-9+.-]*://'

  if [[ $rest =~ $scheme_re ]]; then
    rest=${rest#*://}
  fi
  rest=${rest%%[/?#]*}
  rest=${rest##*@}
  printf '%s\n' "${rest%%:*}"
}

trap onfail ERR

url=${1:?usage: pandocweb URL [OUTPUT]}

# Resolve the output path while still in the caller's directory; both the
# default name and a relative OUTPUT argument depend on it.
out_name=${2:-}
if [[ -z $out_name ]]; then
  out_name=$(get_tmpname)
fi
out_name=$(readlink -f -- "$out_name")

hostname=$(url_host "$url")

tmpdir=$(mktemp -d)

cd -- "$tmpdir"

wget \
  --page-requisites \
  --adjust-extension \
  --span-hosts \
  --convert-links \
  --restrict-file-names=windows \
  --domains "$hostname" \
  --no-parent \
  "$url" 1>&2

# Take the first HTML file found. "grep ." makes the pipeline (and thus the
# script, via set -e/pipefail) fail when nothing was downloaded; -quit stops
# find after one hit so no stage is killed by SIGPIPE.
target=$(find . -type f -name '*.htm*' -print -quit | grep .)
file=$(basename -- "$target")

# Convert from inside the page's own directory so the relative links that
# wget rewrote (--convert-links) resolve.
cd -- "$(dirname -- "$target")"
printf 'converting %s ... ' "$file" >&2
# PANDOC is left unquoted on purpose so it may carry extra arguments.
# shellcheck disable=SC2086
$PANDOC "./$file" -o "$out_name" >&2
printf 'done. saved as:\n' >&2
printf '%s\n' "$out_name"

# Step out of the temporary tree before deleting it. "/" is used rather than
# a bare "cd" so this does not depend on HOME being set.
cd /
rm -rf -- "${tmpdir:?}"