Skip to content

Commit ff97d5a

Browse files
committed
Avoid generating derivatives of datacite xml files
1 parent d164a92 commit ff97d5a

File tree

1 file changed

+103
-102
lines changed

1 file changed

+103
-102
lines changed

caltech_thesis.py

Lines changed: 103 additions & 102 deletions
Original file line number | Diff line number | Diff line change
@@ -16,134 +16,135 @@ def cleanhtml(raw_html):
1616

1717
files = glob.glob('*.xml')
1818
for f in files:
19-
with open(f) as fd:
20-
eprint = xmltodict.parse(fd.read())['eprints']['eprint']
21-
print(eprint['title'])
19+
if 'datacite' not in f:
20+
with open(f) as fd:
21+
eprint = xmltodict.parse(fd.read())['eprints']['eprint']
22+
print(eprint['title'])
2223

23-
metadata = {}
24+
metadata = {}
2425

25-
#Transforming Metadata
26-
#Creators
27-
newa = []
28-
info = eprint['creators']['item']
29-
new = {}
30-
new['affiliations'] = ["California Institute of Technology"]
31-
if 'orcid' in info:
32-
idv = []
33-
nid = {}
34-
nid['nameIdentifier'] = info['orcid']
35-
nid['nameIdentifierScheme'] ='ORCID'
36-
idv.append(nid)
37-
new['nameIdentifiers']=idv
38-
name = info['name']
39-
new['creatorName'] = name['family']+','+name['given']
40-
new['givenName'] = name['given']
41-
new['familyName'] = name['family']
42-
newa.append(new)
26+
#Transforming Metadata
27+
#Creators
28+
newa = []
29+
info = eprint['creators']['item']
30+
new = {}
31+
new['affiliations'] = ["California Institute of Technology"]
32+
if 'orcid' in info:
33+
idv = []
34+
nid = {}
35+
nid['nameIdentifier'] = info['orcid']
36+
nid['nameIdentifierScheme'] ='ORCID'
37+
idv.append(nid)
38+
new['nameIdentifiers']=idv
39+
name = info['name']
40+
new['creatorName'] = name['family']+','+name['given']
41+
new['givenName'] = name['given']
42+
new['familyName'] = name['family']
43+
newa.append(new)
4344

44-
metadata['creators'] = newa
45-
metadata['titles'] = [{'title':eprint['title']}]
46-
metadata['publisher'] = "California Institute of Technology"
47-
metadata['publicationYear'] = eprint['date']
48-
metadata['resourceType']={"resourceType":\
45+
metadata['creators'] = newa
46+
metadata['titles'] = [{'title':eprint['title']}]
47+
metadata['publisher'] = "California Institute of Technology"
48+
metadata['publicationYear'] = eprint['date']
49+
metadata['resourceType']={"resourceType":\
4950
"Dissertation ("+eprint['thesis_degree']+")",'resourceTypeGeneral':"Text"}
5051

51-
if 'doi' in eprint:
52-
metadata['identifier'] = {'identifier':eprint['doi'],'identifierType':"DOI"}
52+
if 'doi' in eprint:
53+
metadata['identifier'] = {'identifier':eprint['doi'],'identifierType':"DOI"}
5354

54-
metadata['descriptions'] =[{'descriptionType':"Abstract",\
55+
metadata['descriptions'] =[{'descriptionType':"Abstract",\
5556
'description':cleanhtml(eprint['abstract'])}]
56-
metadata['formats'] = ['PDF']
57-
metadata['version'] = 'Final'
58-
metadata['language'] = 'English'
57+
metadata['formats'] = ['PDF']
58+
metadata['version'] = 'Final'
59+
metadata['language'] = 'English'
5960

60-
#Subjects
61-
if "keywords" in eprint:
62-
subjects = eprint['keywords'].split(';')
63-
if len(subjects) == 1:
64-
subjects = eprint['keywords'].split(',')
65-
array = []
66-
for s in subjects:
67-
array.append({'subject':s.strip()})
68-
metadata['subjects']=array
69-
if 'option_major' in eprint:
70-
if isinstance(eprint['option_major']['item'],list):
71-
for item in eprint['option_major']['item']:
72-
text = thesis_subjects[item]
61+
#Subjects
62+
if "keywords" in eprint:
63+
subjects = eprint['keywords'].split(';')
64+
if len(subjects) == 1:
65+
subjects = eprint['keywords'].split(',')
66+
array = []
67+
for s in subjects:
68+
array.append({'subject':s.strip()})
69+
metadata['subjects']=array
70+
if 'option_major' in eprint:
71+
if isinstance(eprint['option_major']['item'],list):
72+
for item in eprint['option_major']['item']:
73+
text = thesis_subjects[item]
74+
metadata['subjects'].append({'subject':text})
75+
else:
76+
text = thesis_subjects[eprint['option_major']['item']]
7377
metadata['subjects'].append({'subject':text})
74-
else:
75-
text = thesis_subjects[eprint['option_major']['item']]
76-
metadata['subjects'].append({'subject':text})
77-
if 'option_minor' in eprint:
78-
if isinstance(eprint['option_minor']['item'],list):
79-
for item in eprint['option_minor']['item']:
80-
text = thesis_subjects[item]
78+
if 'option_minor' in eprint:
79+
if isinstance(eprint['option_minor']['item'],list):
80+
for item in eprint['option_minor']['item']:
81+
text = thesis_subjects[item]
82+
metadata['subjects'].append({'subject':text})
83+
else:
84+
text = theis_subjects[eprint['option_minor']['item']]
8185
metadata['subjects'].append({'subject':text})
82-
else:
83-
text = theis_subjects[eprint['option_minor']['item']]
84-
metadata['subjects'].append({'subject':text})
8586

86-
if 'funders' in eprint:
87-
array = []
88-
if isinstance(eprint['funders']['item'],list):
89-
for item in eprint['funders']['item']:
87+
if 'funders' in eprint:
88+
array = []
89+
if isinstance(eprint['funders']['item'],list):
90+
for item in eprint['funders']['item']:
91+
award = {}
92+
award['funderName'] = item['agency']
93+
if 'grant_number' in item:
94+
award['awardNumber'] = {'awardNumber':item['grant_number']}
95+
array.append(award)
96+
else:
97+
item = eprint['funders']['item']
9098
award = {}
9199
award['funderName'] = item['agency']
92100
if 'grant_number' in item:
93101
award['awardNumber'] = {'awardNumber':item['grant_number']}
94102
array.append(award)
95-
else:
96-
item = eprint['funders']['item']
97-
award = {}
98-
award['funderName'] = item['agency']
99-
if 'grant_number' in item:
100-
award['awardNumber'] = {'awardNumber':item['grant_number']}
101-
array.append(award)
102-
metadata['fundingReferences'] = array
103+
metadata['fundingReferences'] = array
103104

104-
if 'rights' in eprint:
105-
metadata['rightsList'] = [{'rights':eprint['rights']}]
105+
if 'rights' in eprint:
106+
metadata['rightsList'] = [{'rights':eprint['rights']}]
106107

107-
if 'related_url' in eprint:
108-
array = []
109-
if isinstance(eprint['related_url']['item'],list):
110-
for item in eprint['related_url']['item']:
108+
if 'related_url' in eprint:
109+
array = []
110+
if isinstance(eprint['related_url']['item'],list):
111+
for item in eprint['related_url']['item']:
112+
if 'CaltechDATA' in item['description']:
113+
obj = {}
114+
obj['relationType']='IsSupplementedBy'
115+
obj['relatedIdentifierType']='DOI'
116+
obj['relatedIdentifier']=item['url']
117+
array.append(obj)
118+
else:
119+
item = eprint['related_url']['item']
111120
if 'CaltechDATA' in item['description']:
112121
obj = {}
113122
obj['relationType']='IsSupplementedBy'
114123
obj['relatedIdentifierType']='DOI'
115124
obj['relatedIdentifier']=item['url']
116125
array.append(obj)
117-
else:
118-
item = eprint['related_url']['item']
119-
if 'CaltechDATA' in item['description']:
120-
obj = {}
121-
obj['relationType']='IsSupplementedBy'
122-
obj['relatedIdentifierType']='DOI'
123-
obj['relatedIdentifier']=item['url']
124-
array.append(obj)
125-
metadata['relatedIdentifiers']=array
126+
metadata['relatedIdentifiers']=array
126127

127-
#Dates
128-
dates = []
129-
dates.append({"date":datetime.date.today().isoformat(),"dateType":"Issued"})
130-
if 'gradofc_approval_date' in eprint:
131-
dates.append({"date":eprint['gradofc_approval_date'],"dateType":"Accepted"})
132-
#These are scanned records, we just list when they were made available
133-
else:
134-
dates.append({"date":eprint['datestamp'],"dateType":"Available"})
135-
metadata['dates'] = dates
128+
#Dates
129+
dates = []
130+
dates.append({"date":datetime.date.today().isoformat(),"dateType":"Issued"})
131+
if 'gradofc_approval_date' in eprint:
132+
dates.append({"date":eprint['gradofc_approval_date'],"dateType":"Accepted"})
133+
#These are scanned records, we just list when they were made available
134+
else:
135+
dates.append({"date":eprint['datestamp'],"dateType":"Available"})
136+
metadata['dates'] = dates
136137

137-
assert schema40.validate(metadata)
138-
#Debugging if this fails
139-
#v = schema40.validator.validate(metadata)
140-
#errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
141-
#for error in errors:
142-
# print(error.message)
138+
assert schema40.validate(metadata)
139+
#Debugging if this fails
140+
#v = schema40.validator.validate(metadata)
141+
#errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
142+
#for error in errors:
143+
# print(error.message)
143144

144-
xml = schema40.tostring(metadata)
145+
xml = schema40.tostring(metadata)
145146

146-
outname = f.split('.xml')[0]+'_datacite.xml'
147-
outfile = open(outname,'w')
148-
outfile.write(xml)
147+
outname = f.split('.xml')[0]+'_datacite.xml'
148+
outfile = open(outname,'w')
149+
outfile.write(xml)
149150

0 commit comments

Comments
 (0)