天天看點

2020爬取美賽數學成績

爬取美賽數學成績

1. 下載所有 pdf

# Request certificates 1.pdf through 10000.pdf from the COMAP 2020
# certificate directory, printing each id before it is requested.
cert_id=1
while (( cert_id <= 10000 )); do
    printf '%s\n' "$cert_id"
    wget "http://www.comap-math.com/mcm/2020Certs/${cert_id}.pdf"
    (( cert_id += 1 ))
done
           

2. 安裝 convert（ImageMagick）

# ImageMagick provides `convert`. It rasterises PDF input through the
# Ghostscript delegate, so install ghostscript explicitly instead of relying
# on it being pulled in as a recommended package.
# NOTE(review): some Debian/Ubuntu releases ship an ImageMagick policy.xml
# that blocks PDF reading; if `convert` reports "not authorized", check it.
sudo apt-get install imagemagick ghostscript
           

3. 安裝 tesseract

# Install the tesseract OCR engine. The `install` subcommand is required;
# without it apt-get treats the package name as an (invalid) operation.
sudo apt-get install tesseract-ocr
           

4.輸出csv

#!/bin/bash
#
# OCR every downloaded certificate "<id>.pdf" in the current directory and
# record which award keyword(s) appear on it.
#
# Files written (all relative to the current directory):
#   jpg/<id>.jpg  certificate converted to an image by ImageMagick `convert`
#   txt/<id>.txt  OCR text produced by `tesseract`
#   all.csv       one "<id> <code>" row per keyword hit (rows are appended)
#   end.csv       all.csv with each run of blanks squeezed into one comma
#
# Requires: convert (ImageMagick) and tesseract on PATH.

# Highest certificate id that is considered.
readonly MAX_ID=1000000

# Keyword prefixes searched for in the OCR text and the one-letter code
# written for each. The two arrays are parallel (same index = same award).
readonly -a AWARD_PREFIXES=(Outst Final Merit Honor Succe Unsuc)
readonly -a AWARD_CODES=(O F M H S U)

#######################################
# Print the code of every award prefix found in an OCR text file.
# Arguments: $1 - path to the text file
# Outputs:   one code per line on stdout, in AWARD_PREFIXES order;
#            nothing when the file is unreadable or no prefix matches
# Returns:   0
#######################################
award_codes() {
  local txt_file=$1
  local k
  [[ -r "$txt_file" ]] || return 0
  for k in "${!AWARD_PREFIXES[@]}"; do
    if grep -qF -- "${AWARD_PREFIXES[k]}" "$txt_file"; then
      printf '%s\n' "${AWARD_CODES[k]}"
    fi
  done
  return 0
}

#######################################
# List the ids (1..MAX_ID) that have a "<id>.pdf" in the current directory.
# Only existing files are listed, so the caller never spawns convert or
# tesseract for ids that were not downloaded.
# Outputs: one id per line on stdout, in ascending numeric order
#######################################
list_certificate_ids() {
  local pdf id
  for pdf in *.pdf; do
    id=${pdf%.pdf}
    # Canonical decimal ids only; this also rejects the literal "*.pdf"
    # left behind when the glob matches nothing.
    [[ "$id" =~ ^[1-9][0-9]*$ ]] || continue
    # Length guard keeps the arithmetic comparison free of overflow.
    (( ${#id} <= 7 && id <= MAX_ID )) || continue
    printf '%s\n' "$id"
  done | sort -n
}

#######################################
# Convert, OCR and classify every certificate, then build end.csv.
# Returns: non-zero if the output directories cannot be created or the
#          final all.csv -> end.csv step fails
#######################################
main() {
  local -a ids=()
  local id code

  # convert/tesseract do not create their output directories themselves.
  mkdir -p jpg txt || return 1
  # Make sure all.csv exists (without truncating earlier rows) so the
  # final step works even when nothing matched.
  : >> all.csv

  mapfile -t ids < <(list_certificate_ids)

  for id in "${ids[@]}"; do
    if ! convert "${id}.pdf" "jpg/${id}.jpg"; then
      printf 'warning: convert failed for %s.pdf, skipping\n' "$id" >&2
      continue
    fi
    # On failure skip the id so a stale txt/<id>.txt is never classified.
    if ! tesseract "jpg/${id}.jpg" "txt/${id}"; then
      printf 'warning: tesseract failed for jpg/%s.jpg, skipping\n' "$id" >&2
      continue
    fi
    while IFS= read -r code; do
      printf '%s %s\n' "$id" "$code" >> all.csv
    done < <(award_codes "txt/${id}.txt")
  done

  tr -s '[:blank:]' ',' < all.csv > end.csv
}

main "$@"

           

繼續閱讀