Python scripting to automate fetching data from RDAS.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

4 年之前
4 年之前
4 年之前
4 年之前
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. import requests, time, os, sys, json, csv
  2. from bs4 import BeautifulSoup
  3. class DataTable:
  4. def __init__(self):
  5. self.table = {}
  6. self.cols = {}
  7. self.rows = {}
  8. class DataPocket:
  9. def __init__(self):
  10. self.rows = {}
  11. def addRow(self, rownum, name=None, values=None):
  12. ### values should be a list in column order
  13. rownum = int(rownum)
  14. d = {'name': name,
  15. 'values': [] if values == None else [x for x in values]
  16. }
  17. self.rows[rownum] = d
  18. def addName(self, rownum, name):
  19. if int(rownum) in self.rows.keys():
  20. self.rows[rownum]['name'] = name
  21. else:
  22. print("No such row.")
  23. def addValues(self, rownum, values):
  24. ### values should be a list in column order
  25. if int(rownum) in self.rows.keys():
  26. self.rows[rownum]['values'] = [x for x in values]
  27. else:
  28. print("No such row.")
  29. def setValue(self, rownum, value, col):
  30. ### col should be a zero-indexed integer
  31. col = int(col)
  32. if int(rownum) in self.rows.keys():
  33. try:
  34. self.rows[rownum]['values'][col] = value
  35. except IndexError:
  36. while len(self.rows[rownum]['values'] < col):
  37. self.rows[rownum]['values'].append(None)
  38. self.rows[rownum]['values'].append(value)
  39. else:
  40. print("No such row.")
  41. def printRow(self, rownum):
  42. if int(rownum) in self.rows.keys():
  43. print("Row {} (\"{}\"): {}".format(rownum, self.rows[rownum]['name'], self.rows[rownum]['values']))
  44. else:
  45. print("No such row.")
  46. def printData(self):
  47. for row in self.rows.keys():
  48. self.printRow(row)
  49. class DataColl:
  50. def __init__(self):
  51. self._year = None
  52. self._row = None
  53. self._col = None
  54. self._control = None
  55. self._cf_eq = None
  56. self._control_filter = None
  57. self._weight = None
  58. self._chisq = None
  59. self._fmt = None
  60. @property
  61. def year(self):
  62. return self._year
  63. @year.setter
  64. def year(self, year):
  65. # if int(year) < 2014:
  66. # print("NSDUH does not have data from before 2014. Setting to 2014.")
  67. # self._year = 2014
  68. self._year = int(year)
  69. self._generate()
  70. @property
  71. def row(self):
  72. return self._row
  73. @row.setter
  74. def row(self, r):
  75. self._row = r
  76. self._generate()
  77. @property
  78. def col(self):
  79. return self._col
  80. @col.setter
  81. def col(self, c):
  82. self._col = c
  83. self._generate()
  84. @property
  85. def control(self):
  86. return self._control
  87. @control.setter
  88. def control(self, ctl):
  89. self._control = ctl
  90. self._generate()
  91. @property
  92. def cf_eq(self):
  93. return self._cf_eq
  94. @cf_eq.setter
  95. def cf_eq(self, eq):
  96. self._cf_eq = eq
  97. self._generate()
  98. @property
  99. def control_filter(self):
  100. return self._control_filter
  101. @control_filter.setter
  102. def control_filter(self, filter):
  103. self._control_filter = filter
  104. self._generate()
  105. @property
  106. def weight(self):
  107. return self._weight
  108. @weight.setter
  109. def weight(self, wgt):
  110. self._weight = wgt
  111. self._generate()
  112. @property
  113. def chisq(self):
  114. return self._chisq
  115. @chisq.setter
  116. def chisq(self, cs):
  117. self._chisq = cs
  118. self._generate()
  119. @property
  120. def fmt(self):
  121. return self._fmt
  122. @fmt.setter
  123. def fmt(self, f):
  124. self._fmt = f
  125. self._generate()
  126. @property
  127. def endpoint(self):
  128. return self._endpoint
  129. def _generate(self):
  130. self.setEndpoint(False)
  131. self.setFilename(False)
  132. def setEndpoint(self, loud=True):
  133. if loud and self._year == None or self._row == None or self._col == None:
  134. print("Year, row, and column must be set in order to build an endpoint.")
  135. self._endpoint = None
  136. lcontrol = "" if self.control == None else "&control={}".format(self.control)
  137. lcfnot = "" if self.cf_eq == True else "!"
  138. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "&filter={}{}%3D{}".format(self.control, lcfnot, self.control_filter)
  139. lweight = "" if self.weight == None else "&weight={}".format(self.weight)
  140. lchisq = "&run_chisq=false" if self.chisq == False or self.chisq == None else "&run_chisq=true"
  141. lfmt = "json" if (self.fmt == None or self.fmt not in ["json", "msgpack", "api"]) else self.fmt
  142. year_rng = "{}-{}".format(int(self.year), int(self.year)+1)
  143. self._endpoint = "https://rdas.samhsa.gov/api/surveys/NSDUH-{}-RD02YR/crosstab/?row={}&column={}{}{}{}{}&format={}".format(
  144. year_rng,
  145. self.row,
  146. self.col,
  147. lcontrol,
  148. lcontrol_filter,
  149. lweight,
  150. lchisq,
  151. lfmt
  152. )
  153. @property
  154. def filename(self):
  155. return self._filename
  156. def setFilename(self, loud=True, ext=None):
  157. if loud and self._year == None or self._row == None or self._col == None:
  158. print("Year, row, and column must be set in order to build a filename.")
  159. self._filename = None
  160. lext = "csv" if ext == None else ext
  161. lcontrol = "" if self.control == None else "_ctl_{}".format(self.control)
  162. lcfnot = "" if self.cf_eq == True else "n"
  163. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "_{}eq_{}".format(lcfnot, self.control_filter)
  164. lweight = "" if self.weight == None else "_weight_{}".format(self.weight)
  165. lchisq = "_chisq_false" if self.chisq == False or self.chisq == None else "_chisq_true"
  166. self._filename = "NSDUH_{}_{}_vs_{}{}{}{}{}.{}".format(
  167. self.year,
  168. self.row,
  169. self.col,
  170. lcontrol,
  171. lcontrol_filter,
  172. lweight,
  173. lchisq,
  174. lext
  175. )
  176. def toString(self):
  177. print("Data Collector:")
  178. print("Row: {}, Column: {}".format(self.row, self.col))
  179. print("Controlling on {} {}= {}".format(self.control, self.control_filter[0], self.control_filter[1]))
  180. print("Weighted by {}".format(self.weight))
  181. print("Generating Chi-Squared" if self.chisq else "Not generating Chi-Squared")
  182. print("Formatting as {}".format(self.fmt))
  183. print("URL: {}".format(self.endpoint))
  184. def main():
  185. df = DataColl()
  186. df.year = 2017
  187. df.row = "STATE"
  188. df.col = "YOSELL2"
  189. df.control = "CATAG18"
  190. df.cf_eq = True
  191. df.control_filter = "2"
  192. df.weight = "DASWT_1"
  193. df.chisq = False
  194. df.fmt = "json"
  195. print(df.endpoint)
  196. df.setFilename(False, "csv")
  197. r = requests.get(df.endpoint)
  198. rjson = r.json() # this creates a Python object, not a JSON string
  199. # in results/cells, column_option and row_option refer to the key field, not the list index
  200. # in column_option and row_option, missing value is coded as "", but coded as "." in key field
  201. # What are the values of "column_option" and "row_option"?
  202. # rc = rjson["results"]["cells"]
  203. # for i in range(10):
  204. # print(rc[i])
  205. # cols = []
  206. # rows = []
  207. # for v in rc:
  208. # print(v.keys())
  209. # if v["column_option"] not in cols:
  210. # cols.append(v["column_option"])
  211. # if v["row_option"] not in rows:
  212. # rows.append(v["row_option"])
  213. # print(f"Cols: {sorted(cols)}")
  214. # print(f"Rows: {sorted(rows)}")
  215. # This just confirms that there are, in fact, 51 states incl. DC in this query
  216. # ro = rjson["results"]["row"]["options"]
  217. # titles = {}
  218. # for v in ro:
  219. # titles[int(v["key"])] = (v["title"])
  220. # for t in sorted(titles.keys()):
  221. # print(f"{titles[t]}")
  222. if __name__ == "__main__":
  223. main()