From a804fb138ad6ebd1ec44e56e2ef42ae0c9f519b7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 13:55:23 +0000 Subject: [PATCH 1/3] Initial plan From 671b8216ba180dfb88aa225829bc364e8e638929 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 14:00:28 +0000 Subject: [PATCH 2/3] Add documents_citations.py module and integrate into dumper Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- publication/documents_citations.py | 242 +++++++++++++++++++++++++++++ publication/dumper.py | 5 +- 2 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 publication/documents_citations.py diff --git a/publication/documents_citations.py b/publication/documents_citations.py new file mode 100644 index 0000000..722e095 --- /dev/null +++ b/publication/documents_citations.py @@ -0,0 +1,242 @@ +# coding: utf-8 +""" +Este processamento gera uma tabulação de citações de cada artigo da +coleção SciELO. 
+""" +import argparse +import logging +import codecs +import datetime + +import utils +import choices + +logger = logging.getLogger(__name__) + + +def _config_logging(logging_level='INFO', logging_file=None): + + allowed_levels = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL + } + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + logger.setLevel(allowed_levels.get(logging_level, 'INFO')) + + if logging_file: + hl = logging.FileHandler(logging_file, mode='a') + else: + hl = logging.StreamHandler() + + hl.setFormatter(formatter) + hl.setLevel(allowed_levels.get(logging_level, 'INFO')) + + logger.addHandler(hl) + + return logger + + +class Dumper(object): + + def __init__(self, collection, issns=None, output_file=None): + + self._ratchet = utils.ratchet_server() + self._articlemeta = utils.articlemeta_server() + self.collection = collection + self.issns = issns + self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file + header = [] + header.append(u"pid") + header.append(u"scielo_issn") + header.append(u"publication_year") + header.append(u"volume") + header.append(u"source") + header.append(u"doi") + header.append(u"publication_type") + header.append(u"number_or_suppl") + header.append(u"reference_pid") + header.append(u"part_title") + + self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + + def write(self, lines): + + if isinstance(lines, unicode): + lines = [lines] + + for line in lines: + if not self.output_file: + print(line.encode('utf-8')) + else: + self.output_file.write('%s\r\n' % line) + + def run(self): + for item in self.items(): + self.write(item) + logger.info('Export finished') + + def items(self): + + if not self.issns: + self.issns = [None] + + for issn in self.issns: + for data in self._articlemeta.documents(collection=self.collection, issn=issn): + 
class Dumper(object):
    """Write one CSV row per citation of each document of a collection.

    Rows are written to ``output_file`` when a path is given, otherwise they
    are printed to standard output. A header row is written as soon as the
    instance is created.
    """

    # Column names, in output order.
    HEADER = (
        u'pid',
        u'scielo_issn',
        u'publication_year',
        u'volume',
        u'source',
        u'doi',
        u'publication_type',
        u'number_or_suppl',
        u'reference_pid',
        u'part_title',
    )

    def __init__(self, collection, issns=None, output_file=None):
        """Open the output and write the header row.

        :param collection: collection acronym passed to the document query.
        :param issns: optional iterable of ISSNs restricting the query.
        :param output_file: optional path of the CSV file to create
            (UTF-8). When omitted, rows are printed.
        """
        self._ratchet = utils.ratchet_server()
        self._articlemeta = utils.articlemeta_server()
        self.collection = collection
        self.issns = issns
        self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file

        self.write(self.join_line(self.HEADER))

    def write(self, lines):
        """Write one line, or every line of an iterable, to the output.

        :param lines: a single text line or an iterable of text lines
            (e.g. the generator returned by :meth:`fmt_csv`).
        """
        # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3,
        # so the check works without naming the Python-2-only builtin.
        text_type = type(u'')

        if isinstance(lines, text_type):
            lines = [lines]

        for line in lines:
            if self.output_file:
                self.output_file.write(u'%s\r\n' % line)
            elif str is text_type:
                # Python 3: stdout takes text directly.
                print(line)
            else:
                # Python 2: encode explicitly, as stdout takes bytes.
                print(line.encode('utf-8'))

    def close(self):
        """Close the output file, if one was opened. Safe to call twice."""
        if self.output_file:
            self.output_file.close()

    def run(self):
        """Export every row and release the output file afterwards."""
        try:
            for item in self.items():
                self.write(item)
        finally:
            # Close even when the export fails, so the handle is not leaked
            # and whatever was written is flushed to disk.
            self.close()
        logger.info('Export finished')

    def items(self):
        """Yield the CSV rows of every document returned by the query.

        When no ISSN was given, a single query with ``issn=None`` is made.
        """
        # Local fallback: do not overwrite the ``issns`` attribute.
        issns = self.issns or [None]

        for issn in issns:
            for data in self._articlemeta.documents(collection=self.collection, issn=issn):
                logger.debug('Reading document: %s', data.publisher_id)
                for item in self.fmt_csv(data):
                    yield item

    def join_line(self, line):
        """Return the values of ``line`` as one quoted CSV record.

        Every value is wrapped in double quotes and embedded double quotes
        are doubled. ``None`` becomes an empty field and non-text values
        are converted to text instead of raising ``AttributeError``.
        """
        fields = []
        for value in line:
            text = u'' if value is None else u'%s' % value
            fields.append(u'"%s"' % text.replace(u'"', u'""'))
        return u','.join(fields)

    @staticmethod
    def _publication_year(citation):
        """Return the citation year, falling back to the raw ``v64`` field."""
        year = u''
        if citation.publication_date:
            try:
                year = u'%s' % citation.publication_date[0:4]
            except (TypeError, ValueError, AttributeError):
                # Unusable date value: try the raw field below.
                pass
        if not year and 'v64' in citation.data:
            year = citation.data['v64'][0].get('_', u'')
        return year

    @staticmethod
    def _issue_or_supplement(citation):
        """Return the citation issue, else subfield ``n`` or ``s`` of ``v882``."""
        if citation.issue:
            return citation.issue
        if 'v882' in citation.data:
            v882_data = citation.data['v882'][0]
            if 'n' in v882_data:
                return v882_data['n']
            if 's' in v882_data:
                return v882_data['s']
        return u''

    def fmt_csv(self, data):
        """Yield the CSV rows of one document, one row per citation.

        A document without citations yields a single row holding only its
        PID and ISSN.

        Columns:
        - pid: document PID
        - scielo_issn: SciELO ISSN of the document's journal
        - publication_year: citation publication year
        - volume: citation volume
        - source: citation source
        - doi: citation DOI
        - publication_type: citation publication type
        - number_or_suppl: citation issue/number or supplement
        - reference_pid: citation PID (raw field ``v880``)
        - part_title: citation article title, else chapter title

        :param data: document object exposing ``publisher_id``,
            ``journal.scielo_issn`` and ``citations`` (presumably the
            object returned by the document query -- confirm with caller).
        """
        pid = data.publisher_id or u''
        issn = data.journal.scielo_issn or u''

        if not data.citations:
            # Keep the document in the output with empty citation columns.
            yield self.join_line([pid, issn] + [u''] * (len(self.HEADER) - 2))
            return

        for citation in data.citations:
            ref_pid = u''
            if 'v880' in citation.data:
                ref_pid = citation.data['v880'][0].get('_', u'')

            yield self.join_line([
                pid,
                issn,
                self._publication_year(citation),
                citation.volume or u'',
                citation.source or u'',
                citation.doi or u'',
                citation.publication_type or u'',
                self._issue_or_supplement(citation),
                ref_pid,
                citation.article_title or citation.chapter_title or u'',
            ])
def main():
    """Command-line entry point: dump the citations of a collection.

    Parses the command-line arguments, configures logging, validates the
    given ISSNs (when any) and runs the export.
    """
    parser = argparse.ArgumentParser(
        description='Dump citations distribution by article'
    )

    parser.add_argument(
        'issns',
        nargs='*',
        help='ISSN\'s separated by spaces'
    )

    parser.add_argument(
        '--collection',
        '-c',
        help='Collection Acronym'
    )

    parser.add_argument(
        '--output_file',
        '-r',
        help='File to receive the dumped data'
    )

    parser.add_argument(
        '--logging_file',
        '-o',
        help='Full path to the log file'
    )

    parser.add_argument(
        '--logging_level',
        '-l',
        default='DEBUG',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help='Logging level'
    )

    args = parser.parse_args()
    _config_logging(args.logging_level, args.logging_file)
    logger.info('Dumping data for: %s', args.collection)

    # Without positional ISSNs the dumper receives None.
    issns = None
    if args.issns:
        issns = utils.ckeck_given_issns(args.issns)

    dumper = Dumper(args.collection, issns, args.output_file)

    dumper.run()
100644 --- a/publication/dumper.py +++ b/publication/dumper.py @@ -13,7 +13,8 @@ documents_languages, documents_licenses, documents_authors, - documents_dates + documents_dates, + documents_citations ) @@ -62,6 +63,7 @@ def __init__(self, collection, home_nationality=None, issns=None): self.documents_licenses = documents_licenses.Dumper(collection, output_file='documents_licenses.csv') self.documents_authors = documents_authors.Dumper(collection, output_file='documents_authors.csv') self.documents_dates = documents_dates.Dumper(collection, output_file='documents_dates.csv') + self.documents_citations = documents_citations.Dumper(collection, output_file='documents_citations.csv') if self.home_nationality: self.documents_affiliations_nationality = documents_affiliations_nationality.Dumper(home_nationality, collection, output_file='documents_affiliation_nationality.csv') @@ -79,6 +81,7 @@ def run(self): self.documents_licenses.write(self.documents_licenses.fmt_csv(data)) self.documents_authors.write(self.documents_authors.fmt_csv(data)) self.documents_dates.write(self.documents_dates.fmt_csv(data)) + self.documents_citations.write(self.documents_citations.fmt_csv(data)) if self.home_nationality: self.documents_affiliations_nationality.write(self.documents_affiliations_nationality.fmt_csv(data)) From 1d120e71e3c9144e7207bc803d0260a4a38af032 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 14:01:26 +0000 Subject: [PATCH 3/3] Fix code review issues: specific exception handling and typo Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- publication/documents_citations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/publication/documents_citations.py b/publication/documents_citations.py index 722e095..2ea3296 100644 --- a/publication/documents_citations.py +++ b/publication/documents_citations.py @@ -142,7 +142,7 @@ def fmt_csv(self, data): 
if citation.publication_date: try: pub_year = unicode(citation.publication_date[0:4]) - except: + except (TypeError, ValueError, AttributeError): pass # Fallback to v64 if publication_date doesn't work if not pub_year and 'v64' in citation.data: @@ -226,7 +226,7 @@ def main(): '-l', default='DEBUG', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logggin level' + help='Logging level' ) args = parser.parse_args()