-
Notifications
You must be signed in to change notification settings - Fork 0
/
marcdedup
executable file
·120 lines (94 loc) · 3.06 KB
/
marcdedup
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# ---------------------------------------------------------------------------
# Argument handling.
#   $1  MARC file to de-duplicate (required; "-" is passed through untouched)
#   $2  tag to match on, optionally followed by one subfield code, e.g. 907a
#       (optional; defaults to 001)
# Sets: infile, idtag
# ---------------------------------------------------------------------------
if [[ -z "${1:-}" ]]; then
  echo "Usage: marcdedup [filename] [tag]"
  echo "Match is on 001 unless a tag and subfield is specified, e.g. 907a"
  exit
fi

infile="${1}"
idtag="${2:-}"

# Fail early on an unreadable file: otherwise awk only warns and the script
# goes on to print a summary for zero records.
if [[ "${infile}" != "-" && ! -r "${infile}" ]]; then
  echo "marcdedup: cannot read input file: ${infile}" >&2
  exit 1
fi

echo
if [[ -z "${idtag}" ]]; then
  idtag="001"
  echo "No tag and subfield specified"
fi
echo "Match will be performed on ${idtag}"
echo
# ---------------------------------------------------------------------------
# Embedded awk program, stored verbatim in $awkscript.
#
# The text is run through backslash-escape expansion before awk sees it, so
# every escape meant for awk is written with a doubled backslash here.
#
# Variables the caller supplies with -v:
#   IDTAG  tag holding the record identifier, optionally followed by a
#          single subfield code (e.g. 001 or 907a)
#   SFS    subfield delimiter character
# Each input record ($0) is one MARC record; the caller sets RS/ORS.
#
# Files written in the current directory:
#   unique_recs.mrc         first record seen for each identifier
#   duplicates.txt          identifier of every later record with a seen id
#   missing_record_ids.mrc  records from which no identifier could be read
# ---------------------------------------------------------------------------
read -r -d '' awkscript << "ENDOFAWK"
#!/usr/bin/awk -f
BEGIN {
    uniquerecs = 0
    duprecs = 0
    missing = 0
    unsupported = 0

    # The quoted letter is a two-byte UTF-8 character, so its length is 2
    # only when awk is counting bytes rather than characters.
    if (length("а") != 2) {
        printf("Your version of awk does not support marcdedup -- you need a version that supports the -b switch\\n")
        unsupported = 1
        exit 1
    }

    # A four-character IDTAG is a three-character tag plus a subfield code;
    # anything else is used as the tag as given.
    id_field = IDTAG
    id_subfield = ""
    if (length(IDTAG) == 4) {
        id_field = substr(IDTAG, 1, 3)
        id_subfield = substr(IDTAG, 4, 1)
    }
}

{
    leader = substr($0, 1, 24)
    base_address = substr(leader, 13, 5) + 0
    directory = substr($0, 25, base_address - 25)
    directory_length = length(directory)

    # The directory is walked in 12-byte entries. A record whose directory
    # is not a whole number of entries is skipped: it is neither written
    # to any output file nor counted.
    if (directory_length % 12 != 0) next

    record_content = substr($0, base_address + 1)

    ########### extract the id
    record_id = ""
    for (i = 1; i <= directory_length; i += 12) {
        if (substr(directory, i, 3) != id_field) continue

        field_length = substr(directory, i + 3, 4) + 0
        starting_pos = substr(directory, i + 7, 5) + 0

        if (substr(record_content, starting_pos + 3, 1) == SFS) {
            # Two leading bytes, then subfields: drop those two bytes and
            # the trailing field terminator.
            field_content = substr(record_content, starting_pos + 3, field_length - 3)
        } else {
            # No subfield delimiter in the third byte: keep everything
            # except the trailing field terminator.
            field_content = substr(record_content, starting_pos + 1, field_length - 1)
        }

        candidate = field_content
        if (id_subfield != "") {
            # Scan every subfield for the requested code; a field without
            # it contributes no identifier.
            candidate = ""
            subfield_count = split(field_content, subfields, SFS)
            for (s = 2; s <= subfield_count; s++) {
                if (substr(subfields[s], 1, 1) == id_subfield) {
                    candidate = substr(subfields[s], 2)
                    break
                }
            }
        }
        gsub(/^ +| +$/, "", candidate)

        # When the tag repeats, the last occurrence with a value wins.
        if (candidate != "") record_id = candidate
    }

    ########### classify the record
    if (record_id == "") {
        print $0 > "missing_record_ids.mrc"
        missing++
    } else if (record_id in seen) {
        duprecs++
        printf "%s\\n", record_id > "duplicates.txt"
    } else {
        seen[record_id] = 1
        uniquerecs++
        print $0 > "unique_recs.mrc"
    }

    if (NR % 10000 == 0) {
        printf "Records searched: %d with %d unique records and %d duplicates and %d records missing ids\\r", NR, uniquerecs, duprecs, missing
    }
}

END {
    # Nothing was processed if the byte-mode check failed in BEGIN.
    if (unsupported) exit 1

    printf "Records searched: %d with %d unique records and %d duplicates and %d records missing ids\\n\\n", NR, uniquerecs, duprecs, missing
    printf "Unique records output to unique_recs.mrc\\n"
    if (missing > 0) { printf "MARC records missing identifiers output to missing_record_ids.mrc\\n" }
    if (duprecs > 0) { printf "Identifiers for duplicate records output to duplicates.txt\\n" }
}
ENDOFAWK
# ---------------------------------------------------------------------------
# Run the embedded awk program over the input file.
#
# $awkscript holds the program with its backslash escapes doubled; %b
# collapses them exactly as `echo -e` would. The expanded text is handed to
# awk as the program argument, so no scratch file is created in the working
# directory and nothing is left behind if the run is interrupted.
#
#   RS / ORS  MARC record terminator (0x1d) for reading and writing records
#   FS        MARC field terminator (0x1e)
#   SFS       MARC subfield delimiter (0x1f)
#   IDTAG     tag (plus optional subfield code) to match on
#   -b        make awk treat the input as bytes, not characters
# ---------------------------------------------------------------------------
awk_program=$(printf '%b' "${awkscript}")
awk -b \
  -v RS=$'\x1d' -v ORS=$'\x1d' -v FS=$'\x1e' -v SFS=$'\x1f' \
  -v IDTAG="${idtag}" \
  -- "${awk_program}" "${infile}"
echo