8000 segrnn/BIO_TAG.py at master · pywirrarika/segrnn · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content
{"payload":{"allShortcutsEnabled":false,"fileTree":{"":{"items":[{"name":"data","path":"data","contentType":"directory"},{"name":"etc","path":"etc","contentType":"directory"},{"name":".gitignore","path":".gitignore","contentType":"file"},{"name":"BIO_TAG.py","path":"BIO_TAG.py","contentType":"file"},{"name":"BIO_TAG_inference.py","path":"BIO_TAG_inference.py","contentType":"file"},{"name":"README.md","path":"README.md","contentType":"file"},{"name":"config.py","path":"config.py","contentType":"file"},{"name":"evaluate.py","path":"evaluate.py","contentType":"file"},{"name":"model.py","path":"model.py","contentType":"file"},{"name":"originaldata_train.sh","path":"originaldata_train.sh","contentType":"file"},{"name":"preproc.py","path":"preproc.py","contentType":"file"},{"name":"seg_rnn.py","path":"seg_rnn.py","contentType":"file"},{"name":"train.sh","path":"train.sh","contentType":"file"},{"name":"viterbi.py","path":"viterbi.py","contentType":"file"}],"totalCount":14}},"fileTreeProcessingTime":32.520347,"foldersToFetch":[],"incompleteFileTree":false,"repo":{"id":143797761,"defaultBranch":"master","name":"segrnn","ownerLogin":"pywirrarika","currentUserCanPush":false,"isFork":true,"isEmpty":false,"createdAt":"2018-08-07T00:27:18.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/457373?v=4","public":true,"private":false,"isOrgOwned":false},"codeLineWrapEnabled":false,"symbolsExpanded":false,"treeExpanded":true,"refInfo":{"name":"master","listCacheKey":"v0:1615096952.458591","canEdit":false,"refType":"branch","currentOid":"fc32fa1a0ac6e8d3e354225c949c85af4bd40510"},"path":"BIO_TAG.py","currentUser":null,"blob":{"rawLines":["#open('/Users/lmy/Dropbox/Personal/Coursework/CIS700-006/Project/POS_tagger_trained_on_Universal_Dependency_French_corpus/file.txt').read().decode('utf-8').split()\r","\r","# -*- coding: utf-8 -*-\r","from __future__ import unicode_literals\r","from __future__ import print_function\r","import codecs \r","from sklearn.feature_extraction import DictVectorizer\r","from sklearn.pipeline import Pipeline\r","from sklearn.linear_model import LogisticRegression\r","from sklearn.metrics import classification_report\r","\r","import argparse\r","\r","parser = argparse.ArgumentParser(description='Logistic regression BIO.')\r","parser.add_argument('--train', help='Training file')\r","parser.add_argument('--test', help='Test file')\r","parser.add_argument('--embed', help='Character embedding file')\r","args = parser.parse_args()\r","\r","train_path = args.train\r","test_path = args.test\r","\r","def features(sentence, index):\r"," #\"\"\" sentence: [w1, w2, ...], index: the index of the word \"\"\"\r"," return {\r"," 'word': sentence[index],\r"," 'is_first': index == 0,\r"," 'is_last': index == len(sentence) - 1,\r"," 'prev_word': '\u003cs\u003e' if index == 0 else sentence[index - 1],\r"," 'next_word': '\u003c/s\u003e' if index == len(sentence) - 1 else sentence[index + 1],\r"," 'is_numeric': sentence[index].isdigit(),\r"," 'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]\r"," }\r"," \r","def untag(tagged_sentence):\r"," return [w for w, t in tagged_sentence] \r"," \r","def gen_corpus(path):\r"," doc = []\r"," tagset = set()\r"," file = codecs.open(path, encoding='utf-8') \r"," #with open(path, encoding='utf-8') as file:\r"," for line in file:\r"," if line[0].isdigit():\r"," features = line.split()\r"," word, pos= features[1], features[3] \r"," if pos != \"_\":\r"," if(len(word)\u003e1):\r"," tagset.add('B'+pos)\r"," tagset.add('I'+pos)\r"," for order in range(len(word)):\r"," if(order==0):\r"," doc.append((word[order], 'B'+pos)) \r"," if(order!=0):\r"," doc.append((word[order], 'I'+pos))\r"," else:\r"," tagset.add('B'+pos)\r"," doc.append((word, 'B'+pos))\r"," \r"," elif len(line.strip()) == 0:\r"," if len(doc) \u003e 0:\r"," words, tags = zip(*doc)\r"," yield (list(words), list(tags))\r"," doc = []\r"," \r","def transform_to_dataset(tagged_sentences):\r"," X, y = [], []\r"," for words, tags in tagged_sentences:\r"," for index, word in enumerate(words):\r"," X.append(features(words, index))\r"," y.append(tags[index])\r"," return X, y\r"," \r","def evaluation(TEST_DATA):\r"," y_pred, y_true = [], []\r"," for words, tags in TEST_DATA:\r"," for i, (word, pos) in enumerate(pos_tag(words)):\r"," y_pred.append(pos)\r"," y_true.append(tags[i])\r"," return y_pred, y_true\r"," \r","def pos_tag(sentence):\r"," tags = clf.predict([features(sentence, index) for index in range(len(sentence))])\r"," return zip(sentence, tags)\r"," \r"," \r","\r","training_sentences = list(gen_corpus(train_path))\r","X, y = transform_to_dataset(training_sentences)\r","clf = Pipeline([\r"," ('vectorizer', DictVectorizer(sparse=True)),\r"," ('classifier', LogisticRegression(n_jobs=4, max_iter=200, verbose=True))\r","])\r","clf.fit(X, y)\r","\r","import pickle\r","pickle.dump(clf, open(\"log_regression.p\", \"wb\"))\r","\r","test_sentences = list(gen_corpus(test_path))\r","X_test, y_test = transform_to_dataset(test_sentences)\r","print( \"Accuracy:\", clf.score(X_test, y_test))\r","\r","\r","y_pred, y_true = evaluation(test_sentences)\r","for l in classification_report(y_true, y_pred).split('\\n'):\r"," print(l)\r","\r","t = \"今天天气非常好。\"\r","print(list(pos_tag(t)))\r"],"stylingDirectives":null,"colorizedLines":null,"csv":null,"csvError":null,"dependabotInfo":{"showConfigurationBanner":false,"configFilePath":null,"networkDependabotPath":"/pywirrarika/segrnn/network/updates","dismissConfigurationNoticePath":"/settings/dismiss-notice/dependabot_configuration_notice","configurationNoticeDismissed":null},"displayName":"BIO_TAG.py","displayUrl":"https://github.com/pywirrarika/segrnn/blob/master/BIO_TAG.py?raw=true","headerInfo":{"blobSize":"3.73 KB","deleteTooltip":"You must be signed in to make or propose changes","editTooltip":"You must be signed in to make or propose changes","ghDesktopPath":"https://desktop.github.com","isGitLfs":false,"onBranch":true,"shortPath":"9f4f49b","siteNavLoginPath":"/login?return_to=https%3A%2F%2Fgithub.com%2Fpywirrarika%2Fsegrnn%2Fblob%2Fmaster%2FBIO_TAG.py","isCSV":false,"isRichtext":false,"toc":null,"lineInfo":{"truncatedLoc":"109","truncatedSloc":"90"},"mode":"file"},"image":false,"isCodeownersFile":null,"isPlain":false,"isValidLegacyIssueTemplate":false,"issueTemplate":null,"discussionTemplate":null,"language":"Python","languageID":303,"large":false,"planSupportInfo":{"repoIsFork":null,"repoOwnedByCurrentUser":null,"requestFullPath":"/pywirrarika/segrnn/blob/master/BIO_TAG.py","showFreeOrgGatedFeatureMessage":null,"showPlanSupportBanner":null,"upgradeDataAttributes":null,"upgradePath":null},"publishBannersInfo":{"dismissActionNoticePath":"/settings/dismiss-notice/publish_action_from_dockerfile","releasePath":"/pywirrarika/segrnn/releases/new?marketplace=true","showPublishActionBanner":false},"rawBlobUrl":"https://github.com/pywirrarika/segrnn/raw/refs/heads/master/BIO_TAG.py","renderImageOrRaw":false,"richText":null,"renderedFileInfo":null,"shortPath":null,"symbolsEnabled":true,"tabSize":8,"topBannersInfo":{"overridingGlobalFundingFile":false,"globalPreferredFundingPath":null,"showInvalidCitationWarning":false,"citationHelpUrl":"https://docs.github.com/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files","actionsOnboardingTip":null},"truncated":false,"viewable":true,"workflowRedirectUrl":null,"symbols":{"timed_out":false,"not_analyzed":false,"symbols":[{"name":"parser","kind":"constant","ident_start":507,"ident_end":513,"extent_start":507,"extent_end":579,"fully_qualified_name":"parser","ident_utf16":{"start":{"line_number":13,"utf16_col":0},"end":{"line_number":13,"utf16_col":6}},"extent_utf16":{"start":{"line_number":13,"utf16_col":0},"end":{"line_number":13,"utf16_col":72}}},{"name":"args","kind":"constant","ident_start":749,"ident_end":753,"extent_start":749,"extent_end":775,"fully_qualified_name":"args","ident_utf16":{"start":{"line_number":17,"utf16_col":0},"end":{"line_number":17,"utf16_col":4}},"extent_utf16":{"start":{"line_number":17,"utf16_col":0},"end":{"line_number":17,"utf16_col":26}}},{"name":"train_path","kind":"constant","ident_start":779,"ident_end":789,"extent_start":779,"extent_end":802,"fully_qualified_name":"train_path","ident_utf16":{"start":{"line_number":19,"utf16_col":0},"end":{"line_number":19,"utf16_col":10}},"extent_utf16":{"start":{"line_number":19,"utf16_col":0},"end":{"line_number":19,"utf16_col":23}}},{"name":"test_path","kind":"constant","ident_start":804,"ident_end":813,"extent_start":804,"extent_end":825,"fully_qualified_name":"test_path","ident_utf16":{"start":{"line_number":20,"utf16_col":0},"end":{"line_number":20,"utf16_col":9}},"extent_utf16":{"start":{"line_number":20,"utf16_col":0},"end":{"line_number":20,"utf16_col":21}}},{"name":"features","kind":"function","ident_start":833,"ident_end":841,"extent_start":829,"extent_end":1345,"fully_qualified_name":"features","ident_utf16":{"start":{"line_number":22,"utf16_col":4},"end":{"line_number":22,"utf16_col":12}},"extent_utf16":{"start":{"line_number":22,"utf16_col":0},"end":{"line_number":32,"utf16_col":5}}},{"name":"untag","kind":"function","ident_start":1357,"ident_end":1362,"extent_start":1353,"extent_end":1424,"fully_qualified_name":"untag","ident_utf16":{"start":{"line_number":34,"utf16_col":4},"end":{"line_number":34,"utf16_col":9}},"extent_utf16":{"start":{"line_number":34,"utf16_col":0},"end":{"line_number":35,"utf16_col":42}}},{"name":"gen_corpus","kind":"function","ident_start":1437,"ident_end":1447,"extent_start":1433,"extent_end":2460,"fully_qualified_name":"gen_corpus","ident_utf16":{"start":{"line_number":37,"utf16_col":4},"end":{"line_number":37,"utf16_col":14}},"extent_utf16":{"start":{"line_number":37,"utf16_col":0},"end":{"line_number":63,"utf16_col":20}}},{"name":"transform_to_dataset","kind":"function","ident_start":2480,"ident_end":2500,"extent_start":2476,"extent_end":2725,"fully_qualified_name":"transform_to_dataset","ident_utf16":{"start":{"line_number":65,"utf16_col":4},"end":{"line_number":65,"utf16_col":24}},"extent_utf16":{"start":{"line_number":65,"utf16_col":0},"end":{"line_number":71,"utf16_col":15}}},{"name":"evaluation","kind":"function","ident_start":2745,"ident_end":2755,"extent_start":2741,"extent_end":2984,"fully_qualified_name":"evaluation","ident_utf16":{"start":{"line_number":73,"utf16_col":4},"end":{"line_number":73,"utf16_col":14}},"extent_utf16":{"start":{"line_number":73,"utf16_col":0},"end":{"line_number":79,"utf16_col":25}}},{"name":"pos_tag","kind":"function","ident_start":2996,"ident_end":3003,"extent_start":2992,"extent_end":3133,"fully_qualified_name":"pos_tag","ident_utf16":{"start":{"line_number":81,"utf16_col":4},"end":{"line_number":81,"utf16_col":11}},"extent_utf16":{"start":{"line_number":81,"utf16_col":0},"end":{"line_number":83,"utf16_col":30}}},{"name":"training_sentences","kind":"constant","ident_start":3149,"ident_end":3167,"extent_start":3149,"extent_end":3198,"fully_qualified_name":"training_sentences","ident_utf16":{"start":{"line_number":87,"utf16_col":0},"end":{"line_number":87,"utf16_col":18}},"extent_utf16":{"start":{"line_number":87,"utf16_col":0},"end":{"line_number":87,"utf16_col":49}}},{"name":"clf","kind":"constant","ident_start":3249,"ident_end":3252,"extent_start":3249,"extent_end":3398,"fully_qualified_name":"clf","ident_utf16":{"start":{"line_number":89,"utf16_col":0},"end":{"line_number":89,"utf16_col":3}},"extent_utf16":{"start":{"line_number":89,"utf16_col":0},"end":{"line_number":92,"utf16_col":2}}},{"name":"test_sentences","kind":"constant","ident_start":3484,"ident_end":3498,"extent_start":3484,"extent_end":3528,"fully_qualified_name":"test_sentences","ident_utf16":{"start":{"line_number":98,"utf16_col":0},"end":{"line_number":98,"utf16_col":14}},"extent_utf16":{"start":{"line_number":98,"utf16_col":0},"end":{"line_number":98,"utf16_col":44}}},{"name":"t","kind":"constant","ident_start":3759,"ident_end":3760,"extent_start":3759,"extent_end":3789,"fully_qualified_name":"t","ident_utf16":{"start":{"line_number":107,"utf16_col":0},"end":{"line_number":107,"utf16_col":1}},"extent_utf16":{"start":{"line_number":107,"utf16_col":0},"end":{"line_number":107,"utf16_col":14}}}]}},"copilotInfo":null,"copilotAccessAllowed":false,"modelsAccessAllowed":false,"modelsRepoIntegrationEnabled":false,"csrf_tokens":{"/pywirrarika/segrnn/branches":{"post":"d-Wb2H8dmm3krQhUSEDYFmXLhLNkXFTx8TOZ7ljDmLs_Ic7DZMzIP2kCoLHL1RO9VEcqFQ8QmuCEOCr56MNp_A"},"/repos/preferences":{"post":"ampVdIsHFkP0rSwZZDNoZX3pNKbH8hbVCxyFCT9PdHyKtTuKHsdOlj1TtcOTL3Y0YIZHceG02XihC4rW0jFS3A"}}},"title":"segrnn/BIO_TAG.py at master · pywirrarika/segrnn","appPayload":{"helpUrl":"https://docs.github.com","findFileWorkerPath":"/assets-cdn/worker/find-file-worker-7d7eb7c71814.js","findInFileWorkerPath":"/assets-cdn/worker/find-in-file-worker-1ae9fa256942.js","githubDevUrl":null,"enabled_features":{"code_nav_ui_events":false,"react_blob_overlay":false,"accessible_code_button":true,"github_models_repo_integration":false}}}
0