I started using JabRef recently and I like it. When I was going through the CVPR 2018 proceedings, I thought: why don't I get all the papers into my database and then mark them there? I searched the Internet but didn't find any information about how to do it, so I wrote the following scripts and posted them here, hoping they are useful to someone. I will add my fetched files to this post later. Stay tuned.

import os
import urllib
import urllib.request

import requests
from lxml import html

# Fetch the CVPR 2018 open-access index page and parse it into an lxml tree.
page = requests.get('http://openaccess.thecvf.com/CVPR2018.py')
tree = html.fromstring(page.content)

links_files = []   # [pdf_url, local_pdf_path] pairs, consumed by the download loop
pdfs = []
bibtex_strs = []   # one finished BibTeX entry string per paper

# Each paper contributes two <dd> elements under #content; the even-indexed
# ones hold the PDF link and the BibTeX snippet.  Derive the total from the
# page itself instead of hard-coding it (it was 1958 for CVPR 2018).
# NOTE(review): assumes every <dd> under #content belongs to a paper pair —
# verify the count matches the proceedings size for other years.
num_dds = len(tree.xpath('//*[@id="content"]/dl/dd'))
for idx in range(2, num_dds + 1, 2):
    pdf_links = tree.xpath('//*[@id="content"]/dl/dd[%d]/a/@href' % idx)

    link = 'http://openaccess.thecvf.com/' + pdf_links[0]
    bib_info = tree.xpath('//*[@id="content"]/dl/dd[%d]/div/div/text()' % idx)
    # The BibTeX key is the token after '{' on the first line,
    # e.g. '@InProceedings{Author_2018_CVPR,' -> 'Author_2018_CVPR'.
    bibkey = bib_info[0].split('{')[1].replace(',', '').strip()
    pdf_name = 'cvpr2018/' + bibkey + '_' + os.path.basename(pdf_links[0])
    links_files.append([link, pdf_name])
    # Inject JabRef-specific 'file' and 'groups' fields into the raw entry text.
    bib_info.insert(6, ',\nfile      = {:%s:PDF},' % pdf_name)
    bib_info.insert(7, '\ngroups    = {cvpr2018}')
    # BUG FIX: the joined entry was previously assigned to a throwaway local
    # and never collected, so the resulting .bib file came out empty.
    bibtex_strs.append(''.join(bib_info))

# Concatenate all collected entries into a single .bib file.
joint_bib_string = ''.join(bibtex_strs)

# BUG FIX: the original `with open(...)` block had no body (a syntax error),
# so nothing was ever written.  Open in binary mode and encode explicitly so
# non-ASCII author names are written as UTF-8 regardless of locale defaults.
with open('cvpr2018.bib', 'wb') as bib_file:
    bib_file.write(joint_bib_string.encode('utf-8'))

# Download every PDF collected above.  A failed download is reported but does
# not abort the remaining downloads (the original dangling `except` with no
# `try` body was a syntax error and silently lost failures).
os.makedirs('cvpr2018', exist_ok=True)  # target dir for pdf_name paths
num = len(links_files)  # was the hard-coded (and float-in-py3) 1959/2
for cnt, (link, pdf_name) in enumerate(links_files, start=1):
    print('[' + str(cnt) + '/' + str(num) + ']  Downloading -> ' + link)
    try:
        urllib.request.urlretrieve(link, pdf_name)
    except Exception as e:
        # Report and continue; one bad link should not stop the batch.
        print('Failed to download ' + link + ': ' + str(e))