Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions tagbase_server/tagbase_server/test/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,68 @@ def test_get_dataset_id(self, mock_connect):
tag_id = pu.get_tag_id(cur, 1)
assert tag_id, "1"

@mock.patch("psycopg2.connect")
def test_is_only_metadata_change(self, mock_connect):
    """Verify pu.is_only_metadata_change() detects a metadata-only resubmission.

    A truthy result means a stored submission exists whose metadata hash
    differs but whose data hash matches; a falsy result means no such row.
    """
    metadata_hash_stored = ["some_hash"]
    file_md_hash = "some_other_hash"
    # result of psycopg2.connect(**connection_stuff)
    mock_con = mock_connect.return_value
    # result of con.cursor(cursor_factory=DictCursor)
    mock_cur = mock_con.cursor.return_value
    # is_only_metadata_change() reads the matching row via cur.fetchone(),
    # so that is the call we must stub (stubbing fetchall() would leave the
    # production code consulting an unconfigured MagicMock).
    mock_cur.fetchone.return_value = metadata_hash_stored
    conn = psycopg2.connect(
        dbname="test",
        user="test",
        host="localhost",
        port="32780",
        password="test",
    )
    cur = conn.cursor()

    # A row came back: different metadata found for the same data hash,
    # i.e. this submission is a metadata-only change.
    is_only_metadata_change = pu.is_only_metadata_change(
        cur, metadata_hash_stored[0], file_md_hash
    )
    assert is_only_metadata_change

    # No row found: not a metadata-only change.
    # NOTE: the previous `assert x, False` form only asserted truthiness —
    # the `, False` was merely the failure message.
    mock_cur.fetchone.return_value = None
    is_only_metadata_change = pu.is_only_metadata_change(
        cur, metadata_hash_stored[0], file_md_hash
    )
    assert not is_only_metadata_change

@mock.patch("psycopg2.connect")
def test_update_submission_metadata(self, mock_connect):
    """Verify pu.update_submission_metadata() issues the expected SQL.

    The previous version of this test called the function but asserted
    nothing, so it could never fail; we now check that the UPDATE and the
    DELETE statements were actually executed through the cursor.
    """
    submission_id = 1
    metadata_attributes = [
        (submission_id, "instrument_name", "some_instrument"),
        (submission_id, "model", "some_model"),
    ]
    # result of psycopg2.connect(**connection_stuff)
    mock_con = mock_connect.return_value
    # result of con.cursor(cursor_factory=DictCursor)
    mock_cur = mock_con.cursor.return_value
    # return this when calling cur.fetchall()
    mock_cur.fetchall.return_value = metadata_attributes

    conn = psycopg2.connect(
        dbname="test",
        user="test",
        host="localhost",
        port="32780",
        password="test",
    )
    cur = conn.cursor()
    tag_id = 1
    dataset_id = 1
    metadata_hash = "some_hash"

    pu.update_submission_metadata(
        cur, tag_id, metadata_attributes, submission_id, dataset_id, metadata_hash
    )

    # At minimum the submission row is updated and the stale metadata rows
    # are deleted before re-insertion.
    assert mock_cur.execute.call_count >= 2
    executed_sql = " ".join(str(call[0][0]) for call in mock_cur.execute.call_args_list)
    assert "UPDATE submission" in executed_sql
    assert "DELETE FROM metadata" in executed_sql

@mock.patch("psycopg2.connect")
def test_processing_file_metadata_with_existing_attributes(self, mock_connect):
metadata_attribs_in_db = [[1, "instrument_name"], [2, "model"]]
Expand Down
51 changes: 25 additions & 26 deletions tagbase_server/tagbase_server/utils/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,13 +262,13 @@ def get_dataset_properties(submission_filename):
)


def is_only_metadata_change(cursor, metadata_hash, file_content_hash):
def is_only_metadata_change(cursor, metadata_hash, file_data_hash):
logger.debug("Detecting metadata submitted...")
cursor.execute(
"SELECT md_sha256 FROM submission WHERE md_sha256 <> %s AND data_sha256 = %s ",
(
metadata_hash,
file_content_hash,
file_data_hash,
),
)
db_results = cursor.fetchone()
Expand Down Expand Up @@ -312,27 +312,30 @@ def update_submission_metadata(
):
# update submission information
current_time = dt.now(tz=pytz.utc).astimezone(get_localzone())
cur.execute(
"UPDATE submission SET md_sha256 = '{}', date_time = '{}'"
" WHERE tag_id = {} AND dataset_id = {} AND submission_id = {}".format(
update_submission_info_query = (
"UPDATE submission SET md_sha256 = '{}', date_time = '{}' "
"WHERE tag_id = {} AND dataset_id = {} AND submission_id = {}".format(
metadata_hash, current_time, tag_id, dataset_id, submission_id
)
)
cur.execute(update_submission_info_query)
logger.info(
"Submission_id=%s updated with metadata hash=%s", submission_id, metadata_hash
)

# update metadata attributes
for x in metadata:
submission_id = x[0]
attribute_id = x[1]
attribute_value = x[2]
attribute_value = str(attribute_value).strip('"')
cur.execute(
"UPDATE metadata SET attribute_value = '{}' WHERE submission_id = {} AND tag_id = {} AND attribute_id = {}".format(
attribute_value, submission_id, tag_id, attribute_id
)
# delete previous metadata since we are going to override it
delete_md_query = (
"DELETE FROM metadata WHERE submission_id = {} AND tag_id = {}".format(
submission_id, tag_id
)
)
cur.execute(delete_md_query)
logger.debug(
"Removed old metadata from submission_id=%s tag_id=%s", submission_id, tag_id
)

# insert new metadata
insert_metadata(cur, metadata, submission_id)
logger.info("Updated metadata attributes: %s", metadata)


Expand All @@ -347,7 +350,6 @@ def process_etuff_file(file, version=None, notes=None):
conn = connect()
conn.autocommit = True

# TODO we should read the file once and return the hashes we need (metadata/content/entire-file)
(
instrument_name,
serial_number,
Expand All @@ -359,11 +361,14 @@ def process_etuff_file(file, version=None, notes=None):
number_global_attributes_lines,
) = get_dataset_properties(submission_filename)
content_hash = make_hash_sha256(file_content)
logger.debug("Content Hash: %s", content_hash)
metadata_hash = make_hash_sha256(metadata_content)
logger.debug("MD Hash: %s", metadata_hash)
entire_file_hash = compute_file_sha256(submission_filename)
logger.debug("File Hash: %s", entire_file_hash)
logger.debug(
"Content Hash: %s\tMetadata Hash: %s\tFile Hash: %s",
content_hash,
metadata_hash,
entire_file_hash,
)

with conn:
with conn.cursor() as cur:
Expand Down Expand Up @@ -415,17 +420,11 @@ def process_etuff_file(file, version=None, notes=None):
)
return 1

# at this point we have already read from the file all global attribute lines
proc_obs = []
variable_lookup = {}
# at this point we have already read form the file all global attribute lines
# line_counter = number_global_attributes_lines

# # TODO we should use the 'content' variable in the following
s_time = time.perf_counter()
# with open(file, "rb") as data:
# lines = [line.decode("utf-8", "ignore") for line in data.readlines()]
# lines_length = len(lines)

num_lines_content = len(file_content)
logger.debug(
"len number_global_atttributes_lines: '%s' len lines_length: '%s'",
Expand Down