爬取美賽數學成績
1. 下載所有 pdf
# Request the PDFs numbered 1 through 10000 from the 2020Certs directory,
# printing each number before it is fetched.
i=1
while (( i <= 10000 )); do
  printf '%s\n' "$i"
  wget "http://www.comap-math.com/mcm/2020Certs/${i}.pdf"
  (( i++ ))
done
2. 安裝 convert（ImageMagick）
# Install the imagemagick package with apt-get (run as root via sudo).
sudo apt-get install imagemagick
3. 安裝 tesseract
# Install the tesseract-ocr package with apt-get (run as root via sudo).
# apt-get needs the `install` subcommand before the package name.
sudo apt-get install tesseract-ocr
4.輸出csv
#!/bin/bash
#
# OCR every downloaded <id>.pdf in the current directory and record which
# award keyword appears in its text.
#
# Outputs:
#   jpg/<id>.jpg  image rendered from <id>.pdf by `convert`
#   txt/<id>.txt  OCR text produced by `tesseract`
#   all.csv       one "<id> <code>" row per keyword hit (appended, so delete
#                 it before a re-run to avoid duplicate rows)
#   end.csv       all.csv with each run of blanks squeezed into one comma

# Highest PDF number that is considered.
max_id=1000000

#######################################
# Print the one-letter code of every award keyword found in a text file,
# one code per line, always in the order O F M H S U.
# Arguments: $1 - path to the OCR text file
# Outputs:   codes on stdout (nothing if no keyword matches)
#######################################
award_codes() {
  local text_file=$1
  local entry
  # Each entry is "<keyword prefix>:<code>"; matching is case-sensitive.
  for entry in "Outst:O" "Final:F" "Merit:M" "Honor:H" "Succe:S" "Unsuc:U"; do
    if grep -qF -- "${entry%%:*}" "$text_file"; then
      printf '%s\n' "${entry##*:}"
    fi
  done
}

# convert and tesseract do not create their output directories themselves.
mkdir -p jpg txt || exit 1

# Collect the numeric ids of the PDFs that actually exist instead of probing
# every number up to max_id and spawning convert/tesseract for missing files.
shopt -s nullglob
ids=()
for pdf in *.pdf; do
  id=${pdf%.pdf}
  if [[ $id =~ ^[1-9][0-9]{0,6}$ ]] && (( id <= max_id )); then
    ids+=("$id")
  fi
done
shopt -u nullglob

# Globs sort lexically (1, 10, 100, ...); process in numeric order instead.
if (( ${#ids[@]} > 0 )); then
  mapfile -t ids < <(printf '%s\n' "${ids[@]}" | sort -n)
fi

# Make sure all.csv exists so the final step works even with zero hits.
: >> all.csv

for i in "${ids[@]}"; do
  if ! convert "$i.pdf" "jpg/$i.jpg"; then
    printf 'warning: convert failed for %s.pdf, skipping\n' "$i" >&2
    continue
  fi
  # tesseract appends ".txt" to the output base name itself.
  if ! tesseract "jpg/$i.jpg" "txt/$i"; then
    printf 'warning: tesseract failed for jpg/%s.jpg, skipping\n' "$i" >&2
    continue
  fi
  while IFS= read -r code; do
    printf '%s %s\n' "$i" "$code" >> all.csv
  done < <(award_codes "txt/$i.txt")
done

tr -s '[:blank:]' ',' < all.csv > end.csv