Project

General

Profile

« Previous | Next » 

Revision 7414

include John Kunze's latest suggestions for improved metadata -- a lot of clean-up, especially on characters in the file. Note UTF-8 encoding of the script.

View differences:

src/doi_registration.sql
629 629
end
630 630
where publisher = creator;
631 631

  
632
-- entity references
633
update doi_registration
634
set publisher = regexp_replace(publisher, '&', '&', 'g');
635
update doi_registration
636
set publisher = regexp_replace(publisher, ''', E'\'', 'g');
637
--accented i (í)
638
update doi_registration
639
set publisher = regexp_replace(publisher, 'í', 'í', 'g')
640
where publisher like '%í%';
641
update doi_registration
642
set creator = regexp_replace(creator, '&', '&', 'g');
643
update doi_registration
644
set creator = regexp_replace(creator, ''', E'\'', 'g');
645

  
646
--titles
647
update doi_registration
648
set title = regexp_replace(title, '&', '&', 'g');
649
update doi_registration
650
set title = regexp_replace(title, '∓', '&', 'g')
651
where title like '%∓%';
652
update doi_registration
653
set title = regexp_replace(title, ''', E'\'', 'g');
654
update doi_registration
655
set title = regexp_replace(title, '"', E'\'', 'g');
656
update doi_registration
657
set title = regexp_replace(title, '>', '>', 'g');
658
update doi_registration
659
set title = regexp_replace(title, '&lt;', '<', 'g');
660
update doi_registration
661
set title = regexp_replace(title, '&#237;', 'í', 'g')
662
where title like '%&#237;%';
663
update doi_registration
664
set title = regexp_replace(title, '&#195;&#173;', 'í', 'g')
665
where title like '%&#195;&#173;%';
666
update doi_registration
667
set title = regexp_replace(title, '&#195;&#177;', 'í', 'g')
668
where title like '%&#195;&#177;%';
669
update doi_registration
670
set title = regexp_replace(title, '&#243;', 'ó', 'g')
671
where title like '%&#243;%';
672
update doi_registration
673
set title = regexp_replace(title, '&#227;', 'ã', 'g')
674
where title like '%&#227;%';
675
update doi_registration
676
set title = regexp_replace(title, '&#231;', 'ç', 'g')
677
where title like '%&#231;%';
678
update doi_registration
679
set title = regexp_replace(title, '&#233;', 'é', 'g')
680
where title like '%&#233;%';
681
update doi_registration
682
set title = regexp_replace(title, '&#241;', 'ñ', 'g')
683
where title like '%&#241;%';
684
update doi_registration
685
set title = regexp_replace(title, '&#226;&#128;&#147;', '-', 'g')
686
where title like '%&#226;&#128;&#147;%';
687
update doi_registration
688
set title = regexp_replace(title, '&#226;&#128;&#153;', E'\'', 'g')
689
where title like '%&#226;&#128;&#153;%';
690
update doi_registration
691
set title = regexp_replace(title, '&#8211;', '/', 'g')
692
where title like '%&#8211;%';
693
update doi_registration
694
set title = regexp_replace(title, '&#195;&#161;', 'á', 'g')
695
where title like '%&#195;&#161;%';
696
update doi_registration
697
set title = regexp_replace(title, '&#195;&#179;', 'ó', 'g')
698
where title like '%&#195;&#179;%';
699
--&#26684;&#24335;&#31684;&#20363; (格式范例)
700
update doi_registration
701
set title = regexp_replace(title, '&#26684;&#24335;&#31684;&#20363;', '格式范例', 'g')
702
where title like '%&#26684;&#24335;&#31684;&#20363;%';
703

  
704
--directional quotes
705
update doi_registration
706
set title = regexp_replace(title, '&#226;&#128;&#157;', '"', 'g')
707
where title like '%&#226;&#128;&#157;%';
708
update doi_registration
709
set title = regexp_replace(title, '&#226;&#128;&#156;', '"', 'g')
710
where title like '%&#226;&#128;&#156;%';
711
--one off apostrophe
712
update doi_registration
713
set title = regexp_replace(title, '&#213;', E'\'', 'g')
714
where title like '%&#213;%';
715
update doi_registration
716
set title = regexp_replace(title, '&#150;', '-', 'g')
717
where title like '%&#150;%';
718

  
719
--after reading the complete metadata, I could interpret these as "greater than"
720
update doi_registration
721
set title = regexp_replace(title, '&t;', '>', 'g')
722
where title like '%&t;%'
723
and guid in ('doi:10.6073/AA/knb-lter-luq.15.1', 'doi:10.6073/AA/knb-lter-luq.39.1', 'doi:10.6073/AA/knb-lter-luq.37.1', 'doi:10.6073/AA/knb-lter-luq.40.1', 'doi:10.6073/AA/knb-lter-luq.18.1' );
724

  
725

  
726
--special characters
727
update doi_registration
728
set creator = regexp_replace(creator, '&#233;', 'é', 'g')
729
where creator like '%&#233;%';
730
update doi_registration
731
set creator = regexp_replace(creator, '&#231;', 'ç', 'g')
732
where creator like '%&#231;%';
733
update doi_registration
734
set creator = regexp_replace(creator, '&#241;', 'ñ', 'g')
735
where creator like '%&#241;%';
736
-- Guti&#195;&#169;rrez, Ralph, J.
737
-- Li&#195;&#169;bault,  Fr&#195;&#169;d&#195;&#169;ric
738
update doi_registration
739
set creator = regexp_replace(creator, '&#195;&#169;', 'é', 'g')
740
where creator like '%&#195;&#169;%';
741
-- Ram&#195;&#173;rez, Alonso
742
update doi_registration
743
set creator = regexp_replace(creator, '&#195;&#173;', 'í', 'g')
744
where creator like '%&#195;&#173;%';
745
-- Gonz&#195;&#161;lez, Grizelle
746
update doi_registration
747
set creator = regexp_replace(creator, '&#195;&#161;', 'á', 'g')
748
where creator like '%&#195;&#161;%';
749
--&#29579;,  (王, 名, 姓)
750
update doi_registration
751
set creator = regexp_replace(creator, '&#29579;', '王', 'g')
752
where creator like '%&#29579;%';
753
--&#21517;
754
update doi_registration
755
set creator = regexp_replace(creator, '&#21517;', '名', 'g')
756
where creator like '%&#21517;%';
757
--&#22995;
758
update doi_registration
759
set creator = regexp_replace(creator, '&#22995;', '姓', 'g')
760
where creator like '%&#22995;%';
761
-- These are left
762
-- Helmbrecht, S &#226;&#128;&#153;rai
763
-- lk&#195;&#177;lkl, Fgfggf
764

  
765
-- clean up
766
update doi_registration
767
set publisher = replace(publisher, E'\n', ' ');
768
update doi_registration
769
set publisher = regexp_replace(publisher, E'\\s+', ' ', 'g');
770
update doi_registration
771
set publisher = regexp_replace(publisher, E'^\\s+', '', 'g');
772

  
773
-- use only year for pub_date
774
update doi_registration
775
set pub_date = to_char(date(pub_date), 'YYYY')
776
where length(pub_date) > 4
777
and pub_date like '%-%-%';
778

  
779
--set the rest to upload date, xml_documents
780
update doi_registration doi
781
set pub_date = to_char(date_created, 'YYYY')
782
from xml_documents x, identifier id
783
where length(pub_date) > 4
784
and doi.guid = id.guid
785
and id.docid = x.docid
786
and id.rev = x.rev;
787
-- revisions
788
update doi_registration doi
789
set pub_date = to_char(date_created, 'YYYY')
790
from xml_revisions x, identifier id
791
where length(pub_date) > 4
792
and doi.guid = id.guid
793
and id.docid = x.docid
794
and id.rev = x.rev;
795

  
632 796
-- copy to the external file
633 797
COPY 
634 798
(select ezid_account, 

Also available in: Unified diff