Right. So here it is. Tailor-made for my specific needs. Kind of.
Or not.
Quote:Please correct the following errors before continuing:
The type of file that you attached is not allowed. Please remove the attachment or choose a different type.
Wall o' text:
Code:
#!/bin/bash
# Condenses a list of comic book names (one per line) into one line per
# series, with the issue numbers of each series collapsed into ranges.
# Options: -d DIR   directory that receives the output file
#          -i FILE  input list (default: ComicList)
#          -o FILE  output file name
#          -q       do not show progress on stdout
#          -w N     truncate the progress display to N characters

# Decorations that may be attached to an issue number (001.5, 12.NOW, v2, T1).
declare -a SuffixList=( ".0" ".1" ".2" ".3" ".4" ".5" ".6" ".7" ".8" ".9" ".BEY" ".NOW" ".INH" )
declare -a PrefixList=( T v )
# State of the series currently being collected. [Title] is initialised here
# because the rest of the script reads and writes it.
declare -A Series=( [Comic]="" [Title]="" [ThisYear]="" [NextYear]="" \
                    [Status]="" [Prefix]="" )
# Issue numbers collected so far for the current series.
declare -a Issues=("")
declare -a ExceptionTitles=( 'Area 10' 'Area 510' '^The New 52$' )
# ? Because otherwise the algorithm would see Area 10 and Area 510 as two
# issues of the same series instead of two different books, and would
# collapse all 100+ issues of The New 52 - Futures End event into a single
# "The New 52"
Date="$(date '+%F - %T')"
ARCHIVE="./"
OutputFile="Condensed List - $Date"
ComicList="ComicList"
# A parenthesised year, year-yy or year-year, preceded by a space.
# NOTE(review): years after 2025 are not recognised by this pattern.
YEAR=' \(((19[0-9]{2}|20[0-1][0-9]|202[0-5])|(19[0-9]{2}|20[0-1][0-9]|202[0-5])(-|\,)[0-9]{2}|(19[0-9]{2}|20[0-1][0-9]|202[0-5])(-|\,)(19[0-9]{2}|20[0-1][0-9]|202[0-5]))\)'
declare WorkingComic="" Title="" Issue="" Prefix="" Comic="" LastWord=""
declare Quiet=1 OutputWidth=40

# The leading ':' selects getopts' silent mode: an unknown option arrives as
# '?' and a missing argument as ':', with the offending letter in OPTARG.
while getopts ":d:i:o:qw:-" OPTION; do
  case $OPTION in
    d) #Location of output
      if [[ -d "$OPTARG" ]]; then ARCHIVE="$OPTARG"
      else echo "Ignoring -d: '$OPTARG' is not a directory" >&2; fi ;;
    i) #Name of input file
      if [[ -f "$OPTARG" ]]; then ComicList="$OPTARG"
      else echo "Ignoring -i: '$OPTARG' is not a file" >&2; fi ;;
    o) OutputFile="$OPTARG" ;; #Kinda obvious
    q) Quiet=0 ;; #Suppress progress to stdout
    w) #Keeps long titles short
      if [[ "$OPTARG" =~ ^[0-9]+$ ]]; then OutputWidth=$OPTARG
      else echo "Ignoring -w: '$OPTARG' is not a number" >&2; fi ;;
    -) break ;;
    :) echo "Option -$OPTARG requires an argument" >&2; exit 1 ;;
    *) echo "Unrecognized option: -$OPTARG" >&2; exit 1 ;;
  esac
done
shift $(( OPTIND - 1 ))

# Honour -d: a bare output file name is placed inside ARCHIVE. A name that
# already carries a path (given with -o) is left exactly as the user wrote it.
[[ "$OutputFile" == */* ]] || OutputFile="${ARCHIVE%/}/$OutputFile"
# Extract_Year Pattern Text
# Prints every substring of Text that matches the extended regex Pattern.
# The result is echoed unquoted on purpose: word splitting drops the leading
# space that is part of the match and joins several matches with one space.
Extract_Year() {
  local Matches
  Matches=$(grep -E -o "$1" <<<"$2")
  echo $Matches
}
# Get_Last_Word Text
# Prints the trailing "word" of Text. When Text ends in ')', that is an
# opening parenthesis followed by everything after the last '('; otherwise
# it is everything after the last space.
Get_Last_Word() {
  local Text="$1"
  if [[ "$Text" == *')' ]]; then
    echo "(${Text##*'('}"
  else
    echo "${Text##* }"
  fi
}
# Remove_Cruft VarName
# Strips every trailing " (...)" group from the variable named VarName,
# modifying it in place through a nameref.
# The caller's variable must not be called UnCrufted, Word, Stripped or Cruft,
# because those names are local to this function.
Remove_Cruft() { # ref UnCrufted
  local -n UnCrufted="$1"
  local Word="" Stripped=""
  local Cruft='\(.*\)'
  Word="$(Get_Last_Word "$UnCrufted")"
  while [[ "$Word" =~ $Cruft ]]; do
    # Strip from the referenced variable itself, not from a global, and quote
    # the word so its characters are not treated as a glob pattern.
    Stripped="${UnCrufted% "$Word"}"
    # Nothing was removed (e.g. the whole line is "(2020)" or there is no
    # space before the parenthesis): stop instead of looping forever.
    [[ "$Stripped" == "$UnCrufted" ]] && break
    UnCrufted="$Stripped"
    Word="$(Get_Last_Word "$UnCrufted")"
  done
}
# Add_Issue ListName Element GroupName
# Appends Element to the indexed array named ListName and marks the
# associative array named GroupName as processed.
Add_Issue() { # ref List, Element, ref Group
  local NewItem="$2"
  local -n Target="$1"
  local -n Record="$3"
  # Left unquoted on purpose: an empty Element appends nothing at all.
  Target+=($NewItem)
  Record[Status]="Processed"
}
# Is_Issue_Number Word NumberVar PrefixVar Title
# Decides whether Word (the last word of Title) is an issue number.
#   Word       candidate word
#   NumberVar  name of the variable that receives the bare number ("" on failure)
#   PrefixVar  name of the variable that receives the prefix letter ("" if none)
#   Title      the full string Word was taken from
# Returns 0 when Word is an issue number, 1 otherwise.
# Reads the globals SuffixList, PrefixList and ExceptionTitles.
Is_Issue_Number() { # Word, ref Number, ref Prefix, Title
  ### local functions ###
  # True when the text after Word's last dot occurs in the joined SuffixList.
  Has_Suffix() { [[ "${SuffixList[*]}" =~ ${1##*.} ]]; }
  Not_an_Exception() { #Not all trailing numbers are issue numbers
    # A number followed by one or two dashes further left in the title means
    # the trailing number belongs to a subtitle ("... 02 - HPL 1920").
    local Exception='( [0-9][0-9]+(\.[0-9])? -[-]? .* [v]?[0-9][0-9]+)|( [0-9][0-9]+(\.[0-9])? -[-]? [.* ]?[v]?[0-9][0-9]+)'
    ! [[ "$1" =~ $Exception ]]; }
  Exception_to_the_Exception() { #Because, of course there are.
    # NOTE(review): "[.* ]?" is a bracket expression (one optional '.', '*'
    # or space), not "(.* )?" -- confirm that this is what was meant.
    local ExceptionToException='(^Marvel 1602|^Spider-Man 2099|^The New 52|^James Bond 007|^James Bond - 007|Book [0-9]+ [-]?- [.* ]?[0-9]+$)'
    [[ "$1" =~ $ExceptionToException ]]; }

  local Word="$1"
  local -n Number=$2
  local -n PrefixRtn=$3
  local Title="$4"
  local ThisPrefix="" Suffix="" Pattern="" Candidate=""

  # Every failing path reports empty results, so start from that state.
  Number=""; PrefixRtn=""

  ### Quick Checks ###
  # Each ExceptionTitles entry is a regex applied to the title. (Using the
  # title as the regex against the joined list made the anchors useless and
  # treated any substring of the list, e.g. "Area 51", as an exception.)
  for Pattern in "${ExceptionTitles[@]}"; do
    [[ "$Title" =~ $Pattern ]] && return 1
  done
  if [[ "$Word" == '-1' ]]; then Number="$Word"; return 0; fi
  [[ ${#Word} == 1 ]] && return 1
  # Literal comparison against PrefixList instead of using the first
  # character as a regex (where '.' would match anything).
  for Candidate in "${PrefixList[@]}"; do
    if [[ "${Word::1}" == "$Candidate" ]]; then ThisPrefix="$Candidate"; break; fi
  done
  # A prefix letter followed by one digit (v2, T1) is always an issue number.
  if [[ -n "$ThisPrefix" && ${#Word} == 2 && "${Word:1}" == [0-9] ]]; then
    Number="${Word:1}"; PrefixRtn="$ThisPrefix"; return 0
  fi
  # Anything else must start with a digit or a known prefix.
  [[ -z "$ThisPrefix" && "$Word" != [0-9]* ]] && return 1

  ### Pre-processing ###
  [[ -n "$ThisPrefix" ]] && Word="${Word:1}"
  if Has_Suffix "$Word"; then Suffix=".${Word#*.}"; Word="${Word/"$Suffix"/}"; fi

  ### Issue numbers do not have both prefix and suffix ###
  [[ -n "$Suffix" && -n "$ThisPrefix" ]] && return 1

  [[ "$Word" =~ ^[0-9]+$ ]] || return 1
  if Not_an_Exception "$Title" || Exception_to_the_Exception "$Title"; then
    Number="$Word"; PrefixRtn="$ThisPrefix"; return 0
  fi
  return 1
}
# Process_Comics SeriesVar IssuesVar NextTitle NextIssue NextPrefix
# Writes the series collected so far to $OutputFile, then starts collecting
# the next one.
#   SeriesVar  name of the associative array with Title/Prefix/ThisYear/NextYear
#   IssuesVar  name of the indexed array of issue numbers for that series
#   NextTitle, NextIssue, NextPrefix  first entry of the series that follows
# A series with fewer than two issues is written as "Title [issue] [year]".
# Otherwise the title is followed by comma-separated issues, with runs of
# equal or consecutive numbers collapsed to "first-last".
# Reads the globals OutputFile, Quiet and OutputWidth. Always returns 0.
Process_Comics() { # ref SeriesInfo, ref IssueList, NextSeries, NextIssue, NextPrefix
  ### local functions ###
  Show_Progress() { # Output
    local Output="${1::$OutputWidth}"
    tput rc; tput el; printf '%s' "$Output"; }
  # Numeric value of an issue string. The sign is kept outside the 10# base
  # prefix, because "10#-1" is an invalid integer constant in newer bash.
  Issue_Value() { # Issue, ref Value
    local -n Value=$2
    Value=$(( ${1%%[0-9]*}10#${1#-} )); }
  # Writes the current First..Last run followed by the given terminator.
  Print_Range() { # Terminator
    if (( FirstNum < LastNum )); then
      printf '%s%s-%s%s%s' "$Prefix" "$First" "$Prefix" "$Last" "$1"
    else
      printf '%s%s%s' "$Prefix" "$First" "$1"
    fi >> "$OutputFile"; }

  local -n SeriesInfo=$1
  local -n IssueList=$2
  local NextTitle="$3"
  local NextIssue="$4"
  local NextPrefix="$5"
  # Take the finished series' values and store the next series' in their place.
  local Title="${SeriesInfo[Title]}"; SeriesInfo[Title]="$NextTitle"
  local Prefix=${SeriesInfo[Prefix]}; SeriesInfo[Prefix]="$NextPrefix"
  local ThisYear="${SeriesInfo[ThisYear]}"; SeriesInfo[ThisYear]="${SeriesInfo[NextYear]}"
  local First="" Last="" SingleIssue="" Index="" This=""
  local FirstNum=0 LastNum=0 ThisNum=0

  (( ${#IssueList[@]} < 2 )) && SingleIssue="true"
  if [[ -n ${IssueList[0]} ]]; then
    First="${IssueList[0]}"; Last="$First"; unset 'IssueList[0]'
    Issue_Value "$First" FirstNum; LastNum=$FirstNum
  fi

  if [[ -n "$Title" ]]; then
    if [[ $SingleIssue == 'true' ]]; then
      [[ -n $First ]] && Title+=" ${Prefix}$First"
      [[ -n $ThisYear ]] && Title+=" $ThisYear"
      printf '%s\n' "$Title" >> "$OutputFile"
    else
      printf '%s ' "$Title" >> "$OutputFile"
      for Index in "${!IssueList[@]}"; do
        This="${IssueList[$Index]}"
        unset 'IssueList[$Index]'
        [[ -z "$This" ]] && continue
        Issue_Value "$This" ThisNum
        if [[ -z "$First" ]]; then
          # No run has been opened yet: this issue starts one.
          First="$This"; Last="$This"; FirstNum=$ThisNum; LastNum=$ThisNum
        elif (( ThisNum == LastNum || ThisNum == LastNum + 1 )); then
          Last="$This"; LastNum=$ThisNum
        else
          Print_Range ','
          First="$This"; Last="$This"; FirstNum=$ThisNum; LastNum=$ThisNum
        fi
      done
      Print_Range $'\n'
    fi ## SingleIssue ==
  fi ## Title ==

  IssueList=(); Add_Issue IssueList "$NextIssue" SeriesInfo
  if [[ "$Quiet" == 1 ]]; then Show_Progress "$NextTitle"; fi
  # Explicit status: the function used to return 1 whenever -q was given.
  return 0
}
# Crop_Comic Text
# Prints Text with its last " - ..." or " -- ..." section removed, whichever
# separator sits further right. Prints Text unchanged when it has neither.
Crop_Comic() {
  local Full="$1"
  local AtDash="${Full% - *}"
  local AtDoubleDash="${Full% -- *}"
  if [[ "$AtDash" == "$AtDoubleDash" ]]; then
    # Neither separator is present.
    echo "$Full"
  elif [[ "$AtDoubleDash" == "$Full" ]]; then
    echo "$AtDash"
  elif [[ "$AtDash" == "$Full" ]]; then
    echo "$AtDoubleDash"
  elif (( ${#AtDash} > ${#AtDoubleDash} )); then
    # Both are present: keep the longer remainder, i.e. crop the least.
    echo "$AtDash"
  else
    echo "$AtDoubleDash"
  fi
}
### main()
# Reads $ComicList line by line. Each line either adds an issue to the series
# being collected or, when the title changes, flushes that series through
# Process_Comics and starts a new one.
if [[ ! -r "$ComicList" ]]; then
  echo "Cannot read input file: $ComicList" >&2
  exit 1
fi
# The cursor position is only needed for the progress display.
if [[ "$Quiet" == 1 ]]; then tput sc; fi
# "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
while IFS= read -r Comic || [[ -n "$Comic" ]]; do
  Series[Status]="NotProcessed"
  Series[NextYear]=$( Extract_Year "$YEAR" "$Comic" )
  Remove_Cruft Comic
  WorkingComic="$Comic"
  # Keep cropping " - ..." sections off the right until an issue number shows
  # up or nothing is left to crop.
  while [[ ${Series[Status]} == "NotProcessed" ]]; do
    LastWord="$( Get_Last_Word "$WorkingComic" )"
    if Is_Issue_Number "$LastWord" Issue Prefix "$WorkingComic"; then
      Title="${WorkingComic% "$LastWord"*}"
      if [[ "$Title" == "${Series[Title]}" ]]; then
        # Same series as the previous line: just record the issue.
        Series[Prefix]="$Prefix"
        Add_Issue Issues "$Issue" Series
      else
        Process_Comics Series Issues "$Title" "$Issue" "$Prefix"
      fi
    else
      Cropped="$(Crop_Comic "$WorkingComic")"
      if [[ "$Cropped" == "$WorkingComic" ]]; then
        # No issue number anywhere: the whole line is its own title.
        Process_Comics Series Issues "$Comic" "" ""
      else
        WorkingComic="$Cropped"
      fi
    fi
  done
done < "$ComicList"
### Still need to process last comic ###
Process_Comics Series Issues "" "" ""
# An if statement, so the script no longer ends with status 1 under -q.
if [[ "$Quiet" == 1 ]]; then tput rc && tput el; fi
Let's see if I can attach ComicList.
Nope. First it didn't like the type of file, so I added a .txt extension, and now it doesn't like the 1.2 MiB size.
Anyway.
As you can see from the crunchbang, this is a bash script. Bash is a scripting language, not a programming programming language. Trying to use a scripting language as a programming language often leads to frustration, tears, and heartbreak. If you're lucky. If you aren't, well let's not get into it. First issue: all variables are global in scope unless declared local. But all variables, whether declared local or not, are visible to any functions called from that scope (child processes only see exported variables). Second issue: Arrays in bash aren't arrays. They are special space- and/or \0-delimited strings. Of no fixed size. And the indices do not need to be sequential. The relatively new associative array (bash 4.0+) just adds more fun to the party. Third issue: variables and arrays do not need to be declared. You want a new variable? Just assign a value to it. Frustrating when you can't figure out why
echo "$Abbrevation" keeps coming up empty when you know for a fact that you made the
Abbreviation="dh" assignment the line before.
Getting to the script. Yes, I use UpperCamelCase. Deal with it. A remnant from learning Pascal in college. I like to visually differentiate variables from system commands. I use Snake_Case for function names, where appropriate. These are terms I've only recently learned.
Nested functions: Only used as a reminder of where a sub-function is called. Functions, like variables, are global, limited only in that a function must either be defined in the text of the script before it is called or have been instantiated by running the function that contains its definition. Example:
testfunction:
Code:
#!/bin/bash
Outer_Function 1   # fails: Outer_Function is not defined until the next line
Outer_Function() {
Inner_Function 2   # fails when reached: Inner_Function is only defined on the next line
Inner_Function() { echo "$1"; }   # this definition takes effect when Outer_Function runs
Inner_Function "$1"   # prints Outer_Function's first argument
}
Inner_Function 3   # fails: Outer_Function has not run yet, so Inner_Function does not exist
Outer_Function 4   # reports the failure on line 4, then prints 4
Inner_Function 5   # works: Inner_Function stays defined after Outer_Function has run
Output:
testfunction: line 2: Outer_Function: command not found
testfunction: line 8: Inner_Function: command not found
testfunction: line 4: Inner_Function: command not found
4
5
As you can see, once Outer_Function was called, Inner_Function became a thing to the shell and is now globally available.
I only just, as in within the last 5 minutes, learned that it is possible to make function variables and nested functions truly local by running the function in a subshell --
Function() ( code; ) -- instead of in the current shell --
Function() { code; } -- but that's something I'd have to look into regarding passing arrays by "reference", and something that I probably wouldn't have used for this script because I'm using global variable side-effects that would set my programming instructors' teeth on edge.
Is_Issue_Number(): In bash, and presumably sh, zsh, fish, etc., when a function exits, it stores the return value (whether the function ran correctly or not, 0 is true) in the ? variable which is used by 'if' to determine true/false.
Code:
if Is_Issue_Number "$LastWord" Issue Prefix "$WorkingComic"; then ...
and
Is_Issue_Number "$LastWord" Issue Prefix "$WorkingComic"
if (( $? == 0 )); then ...
The above are equivalent. The return value can be set. Don't know about you, but I was taught that one does NOT use functions to change the values of parameters, yet that is exactly what I'm doing with Is_Issue_Number. I'm using the true/false-ness for the if-then branching while also setting the Issue and Prefix variables. Yes, I'm somewhat mitigating undesired side-effects by using the local -n switch ("This variable is now called this other name."), but I'd still be docked at least a few points if I handed something like this in for homework.
Whatever.
The idea of the script is that anything between parentheses at the end of the comic book string, of which there may be many or none, is unneeded. Except for the year. The year might be needed, so record the year. Issue numbers will be the last word (characters surrounded by white space) in the string, followed by parentheticals, or followed by one or two dashes ( -/-- ). If you find an issue number and the current title (everything before the issue number) is the same as the last comic you checked, then record the issue number and get a new comic. If it's a different title, then print the old title, all the issue numbers, and the year for that title if there aren't two or more issue numbers. If you do not find an issue number, then this title is by default different from the previous title. Rinse, repeat.
Printing out issue numbers: assign first element in array to First and Last. Going through the array, if each number is equal to or one greater than the previous number, then increase Last by 1, otherwise print "$First-$Last," (or "$Last," if equal) (Why $Last and not $First? One fewer characters to type); assign this number to First and Last, and continue through the array. When at the end of the array, do one final printing for the title.
That's it. All the rest is taking care of exceptions to the rules.
Example: sometimes the last number in a string isn't the issue number for the title. Like "Lovecraft Adaptations 02 - HPL 1920". The previous book is "Lovecraft Adaptations 01 - Beyond the Wall of Sleep", so obviously 1920 isn't an issue number.
Or look at Batman. "Batman 910" isn't "Batman 910", it's "Batman v3 145". But you know I'm not going to use "Batman v3 145" because fuck you Warner Brothers, I'm going to use "Batman 910". But the cover reads, "Batman v3 145". So I record it as "Batman 910 - v3 145". That means when the script encounters something like "Batman 910 - v3 145" or "Lovecraft Adaptations 02 - HPL 1920" it needs to check if there is a number followed by a dash (or two) further left in the string.
Good job, everyone. Well done. Great.
But what about "Spider-Man 2099 - Dark Genesis 01"? Whoops. So now we need to check for exceptions to the exceptions.
Weakness of the script: Garbage in/garbage out. The script reads one line at a time looking neither forward nor backward in the input file. If the file isn't well-sorted, the output is going to be a mess.
Let's take a look at a couple Batman titles: Batman Family and Batman - Family. Using the sort command, we get
Batman Family 01
Batman - Family 01
Batman Family 02
Batman - Family 02
etc.
Using the -V switch results in
Batman Family 01
Batman Family 02
...
Batman - Family 01
Batman - Family 02
...
Good. That's what we want. But it does nothing for Batman and Batman 80-Page Giant:
Batman 043
Batman 047
Batman 80-Page Giant 01
Batman 80-Page Giant 02
Batman 80-Page Giant 03
Batman 80-Page Giant (2010)
Batman 80-Page Giant (2011)
Batman 081
Batman 085, etc.
Same with Alien and Alien 3; Batman Beyond and Batman 2.0; Harbinger Wars and Harbinger Wars 2; Marvel Zombies, Marvel Zombies 2, 3, 4, and 5.
But it's late again. On the 50,000+ line file, it takes almost 10 minutes. Did I mention that bash is slow as hell? Bash is slow as hell.