@@ -16,134 +16,135 @@ def cleanhtml(raw_html):
1616
1717files = glob .glob ('*.xml' )
1818for f in files :
19- with open (f ) as fd :
20- eprint = xmltodict .parse (fd .read ())['eprints' ]['eprint' ]
21- print (eprint ['title' ])
19+ if 'datacite' not in f :
20+ with open (f ) as fd :
21+ eprint = xmltodict .parse (fd .read ())['eprints' ]['eprint' ]
22+ print (eprint ['title' ])
2223
23- metadata = {}
24+ metadata = {}
2425
25- #Transforming Metadata
26- #Creators
27- newa = []
28- info = eprint ['creators' ]['item' ]
29- new = {}
30- new ['affiliations' ] = ["California Institute of Technology" ]
31- if 'orcid' in info :
32- idv = []
33- nid = {}
34- nid ['nameIdentifier' ] = info ['orcid' ]
35- nid ['nameIdentifierScheme' ] = 'ORCID'
36- idv .append (nid )
37- new ['nameIdentifiers' ]= idv
38- name = info ['name' ]
39- new ['creatorName' ] = name ['family' ]+ ',' + name ['given' ]
40- new ['givenName' ] = name ['given' ]
41- new ['familyName' ] = name ['family' ]
42- newa .append (new )
26+ #Transforming Metadata
27+ #Creators
28+ newa = []
29+ info = eprint ['creators' ]['item' ]
30+ new = {}
31+ new ['affiliations' ] = ["California Institute of Technology" ]
32+ if 'orcid' in info :
33+ idv = []
34+ nid = {}
35+ nid ['nameIdentifier' ] = info ['orcid' ]
36+ nid ['nameIdentifierScheme' ] = 'ORCID'
37+ idv .append (nid )
38+ new ['nameIdentifiers' ]= idv
39+ name = info ['name' ]
40+ new ['creatorName' ] = name ['family' ]+ ',' + name ['given' ]
41+ new ['givenName' ] = name ['given' ]
42+ new ['familyName' ] = name ['family' ]
43+ newa .append (new )
4344
44- metadata ['creators' ] = newa
45- metadata ['titles' ] = [{'title' :eprint ['title' ]}]
46- metadata ['publisher' ] = "California Institute of Technology"
47- metadata ['publicationYear' ] = eprint ['date' ]
48- metadata ['resourceType' ]= {"resourceType" :\
45+ metadata ['creators' ] = newa
46+ metadata ['titles' ] = [{'title' :eprint ['title' ]}]
47+ metadata ['publisher' ] = "California Institute of Technology"
48+ metadata ['publicationYear' ] = eprint ['date' ]
49+ metadata ['resourceType' ]= {"resourceType" :\
4950 "Dissertation (" + eprint ['thesis_degree' ]+ ")" ,'resourceTypeGeneral' :"Text" }
5051
51- if 'doi' in eprint :
52- metadata ['identifier' ] = {'identifier' :eprint ['doi' ],'identifierType' :"DOI" }
52+ if 'doi' in eprint :
53+ metadata ['identifier' ] = {'identifier' :eprint ['doi' ],'identifierType' :"DOI" }
5354
54- metadata ['descriptions' ] = [{'descriptionType' :"Abstract" ,\
55+ metadata ['descriptions' ] = [{'descriptionType' :"Abstract" ,\
5556 'description' :cleanhtml (eprint ['abstract' ])}]
56- metadata ['formats' ] = ['PDF' ]
57- metadata ['version' ] = 'Final'
58- metadata ['language' ] = 'English'
57+ metadata ['formats' ] = ['PDF' ]
58+ metadata ['version' ] = 'Final'
59+ metadata ['language' ] = 'English'
5960
60- #Subjects
61- if "keywords" in eprint :
62- subjects = eprint ['keywords' ].split (';' )
63- if len (subjects ) == 1 :
64- subjects = eprint ['keywords' ].split (',' )
65- array = []
66- for s in subjects :
67- array .append ({'subject' :s .strip ()})
68- metadata ['subjects' ]= array
69- if 'option_major' in eprint :
70- if isinstance (eprint ['option_major' ]['item' ],list ):
71- for item in eprint ['option_major' ]['item' ]:
72- text = thesis_subjects [item ]
61+ #Subjects
62+ if "keywords" in eprint :
63+ subjects = eprint ['keywords' ].split (';' )
64+ if len (subjects ) == 1 :
65+ subjects = eprint ['keywords' ].split (',' )
66+ array = []
67+ for s in subjects :
68+ array .append ({'subject' :s .strip ()})
69+ metadata ['subjects' ]= array
70+ if 'option_major' in eprint :
71+ if isinstance (eprint ['option_major' ]['item' ],list ):
72+ for item in eprint ['option_major' ]['item' ]:
73+ text = thesis_subjects [item ]
74+ metadata ['subjects' ].append ({'subject' :text })
75+ else :
76+ text = thesis_subjects [eprint ['option_major' ]['item' ]]
7377 metadata ['subjects' ].append ({'subject' :text })
74- else :
75- text = thesis_subjects [eprint ['option_major' ]['item' ]]
76- metadata [ 'subjects' ]. append ({ 'subject' : text })
77- if 'option_minor' in eprint :
78- if isinstance ( eprint [ 'option_minor' ][ 'item' ], list ):
79- for item in eprint [ 'option_minor' ][ 'item' ] :
80- text = thesis_subjects [ item ]
78+ if 'option_minor' in eprint :
79+ if isinstance (eprint ['option_minor' ]['item' ],list ):
80+ for item in eprint [ 'option_minor' ][ 'item' ]:
81+ text = thesis_subjects [ item ]
82+ metadata [ 'subjects' ]. append ({ 'subject' : text })
83+ else :
84+ text = thesis_subjects [eprint ['option_minor' ]['item' ]]
8185 metadata ['subjects' ].append ({'subject' :text })
82- else :
83- text = theis_subjects [eprint ['option_minor' ]['item' ]]
84- metadata ['subjects' ].append ({'subject' :text })
8586
86- if 'funders' in eprint :
87- array = []
88- if isinstance (eprint ['funders' ]['item' ],list ):
89- for item in eprint ['funders' ]['item' ]:
87+ if 'funders' in eprint :
88+ array = []
89+ if isinstance (eprint ['funders' ]['item' ],list ):
90+ for item in eprint ['funders' ]['item' ]:
91+ award = {}
92+ award ['funderName' ] = item ['agency' ]
93+ if 'grant_number' in item :
94+ award ['awardNumber' ] = {'awardNumber' :item ['grant_number' ]}
95+ array .append (award )
96+ else :
97+ item = eprint ['funders' ]['item' ]
9098 award = {}
9199 award ['funderName' ] = item ['agency' ]
92100 if 'grant_number' in item :
93101 award ['awardNumber' ] = {'awardNumber' :item ['grant_number' ]}
94102 array .append (award )
95- else :
96- item = eprint ['funders' ]['item' ]
97- award = {}
98- award ['funderName' ] = item ['agency' ]
99- if 'grant_number' in item :
100- award ['awardNumber' ] = {'awardNumber' :item ['grant_number' ]}
101- array .append (award )
102- metadata ['fundingReferences' ] = array
103+ metadata ['fundingReferences' ] = array
103104
104- if 'rights' in eprint :
105- metadata ['rightsList' ] = [{'rights' :eprint ['rights' ]}]
105+ if 'rights' in eprint :
106+ metadata ['rightsList' ] = [{'rights' :eprint ['rights' ]}]
106107
107- if 'related_url' in eprint :
108- array = []
109- if isinstance (eprint ['related_url' ]['item' ],list ):
110- for item in eprint ['related_url' ]['item' ]:
108+ if 'related_url' in eprint :
109+ array = []
110+ if isinstance (eprint ['related_url' ]['item' ],list ):
111+ for item in eprint ['related_url' ]['item' ]:
112+ if 'CaltechDATA' in item ['description' ]:
113+ obj = {}
114+ obj ['relationType' ]= 'IsSupplementedBy'
115+ obj ['relatedIdentifierType' ]= 'DOI'
116+ obj ['relatedIdentifier' ]= item ['url' ]
117+ array .append (obj )
118+ else :
119+ item = eprint ['related_url' ]['item' ]
111120 if 'CaltechDATA' in item ['description' ]:
112121 obj = {}
113122 obj ['relationType' ]= 'IsSupplementedBy'
114123 obj ['relatedIdentifierType' ]= 'DOI'
115124 obj ['relatedIdentifier' ]= item ['url' ]
116125 array .append (obj )
117- else :
118- item = eprint ['related_url' ]['item' ]
119- if 'CaltechDATA' in item ['description' ]:
120- obj = {}
121- obj ['relationType' ]= 'IsSupplementedBy'
122- obj ['relatedIdentifierType' ]= 'DOI'
123- obj ['relatedIdentifier' ]= item ['url' ]
124- array .append (obj )
125- metadata ['relatedIdentifiers' ]= array
126+ metadata ['relatedIdentifiers' ]= array
126127
127- #Dates
128- dates = []
129- dates .append ({"date" :datetime .date .today ().isoformat (),"dateType" :"Issued" })
130- if 'gradofc_approval_date' in eprint :
131- dates .append ({"date" :eprint ['gradofc_approval_date' ],"dateType" :"Accepted" })
132- #These are scanned records, we just list when they were made available
133- else :
134- dates .append ({"date" :eprint ['datestamp' ],"dateType" :"Available" })
135- metadata ['dates' ] = dates
128+ #Dates
129+ dates = []
130+ dates .append ({"date" :datetime .date .today ().isoformat (),"dateType" :"Issued" })
131+ if 'gradofc_approval_date' in eprint :
132+ dates .append ({"date" :eprint ['gradofc_approval_date' ],"dateType" :"Accepted" })
133+ #These are scanned records, we just list when they were made available
134+ else :
135+ dates .append ({"date" :eprint ['datestamp' ],"dateType" :"Available" })
136+ metadata ['dates' ] = dates
136137
137- assert schema40 .validate (metadata )
138- #Debugging if this fails
139- #v = schema40.validator.validate(metadata)
140- #errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
141- #for error in errors:
142- # print(error.message)
138+ assert schema40 .validate (metadata )
139+ #Debugging if this fails
140+ #v = schema40.validator.validate(metadata)
141+ #errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
142+ #for error in errors:
143+ # print(error.message)
143144
144- xml = schema40 .tostring (metadata )
145+ xml = schema40 .tostring (metadata )
145146
146- outname = f .split ('.xml' )[0 ]+ '_datacite.xml'
147- outfile = open (outname ,'w' )
148- outfile .write (xml )
147+ outname = f .split ('.xml' )[0 ]+ '_datacite.xml'
148+ outfile = open (outname ,'w' )
149+ outfile .write (xml )
149150
0 commit comments